def main(args):
    """Fit CCA + RBF SVR on the training data and write test predictions.

    ``args`` is an 8-item sequence unpacked as ``(training_file, label_file,
    test_file, u_file, e, c, output_file, components)``:

    * ``training_file`` / ``test_file`` / ``u_file`` are read with
      ``load_feat``;
    * ``label_file`` holds one float label per line;
    * ``e`` and ``c`` are passed to ``SVR`` as ``epsilon`` and ``C``;
    * ``components`` is the number of CCA components;
    * one prediction per line is written to ``output_file``.

    The earlier version also tried to read test labels from ``test_label``,
    a name that is not part of ``args``; that raised ``NameError`` and the
    labels were never used, so the read has been removed.
    """
    (training_file, label_file, test_file, u_file,
     e, c, output_file, components) = args

    X_training = np.asarray(load_feat(training_file))
    U = np.asarray(load_feat(u_file))
    with open(label_file) as label_handle:
        y_training = np.asarray(
            [float(line.strip()) for line in label_handle])

    X_test = np.asarray(load_feat(test_file))
    # Only the test matrix has its NaNs zeroed, as in the original code.
    X_test[np.isnan(X_test)] = 0.0

    # CCA needs the same number of rows in both views: truncate to the
    # shorter of the two.
    s = min(len(X_training), len(U))
    cca = CCA(n_components=components, max_iter=50)
    (X_cca, U_cca) = cca.fit_transform(X_training[:s], U[:s])
    X_test_cca = cca.transform(X_test)

    svr = SVR(C=c, epsilon=e, kernel='rbf')
    svr.fit(X_cca, y_training[:s])
    pred = svr.predict(X_test_cca)

    with open(output_file, 'w') as output:
        for p in pred:
            output.write('%s\n' % p)
    return
def CCA_transform(train_feature, train_label, test_feature, n_components):
    """Project train and test features with a CCA fitted on the train split.

    A ``CCA`` with ``n_components`` components is fitted on
    ``(train_feature, train_label)``; the fitted model is then used to
    transform both feature sets.

    Returns a ``(train_feature_transformed, test_feature_transformed)`` tuple.
    """
    from sklearn.cross_decomposition import CCA

    model = CCA(n_components)
    model.fit(train_feature, train_label)

    projected_train = model.transform(train_feature)
    projected_test = model.transform(test_feature)
    return projected_train, projected_test
def canonical_approach():
    """Fit a 2-component CCA on the data from ``pull_xy_data`` and plot it.

    The X-side canonical scores are handed to ``plot`` together with the
    city labels. Always returns the string ``"OK What Now?"``.
    """
    from sklearn.cross_decomposition import CCA

    data, cities = pull_xy_data()
    X, Y = data

    model = CCA(n_components=2)
    model.fit(X, Y)
    x_scores, y_scores = model.transform(X, Y)

    axis_labels = ["CC01", "CC02", "CC03"]
    plot(x_scores, cities, axis_labels, 1)
    return "OK What Now?"
def __init__(self, dataset, n=None, tol=1e-4):
    """Build a linear combination of ``n`` randomly sampled attributes.

    When ``n`` is omitted it defaults to the ceiling of the square root of
    the number of attributes in ``dataset``. A one-component CCA is fitted
    between the sampled columns of ``dataset.X`` and ``dataset.y``; its first
    X-weight vector becomes the coefficients of ``self.linear_combination``.
    """
    if n is None:
        attribute_count = len(dataset.attributes)
        n = int(numpy.ceil(numpy.sqrt(attribute_count)))

    self.dataset = dataset
    self.attributes = random.sample(dataset.attributes, n)

    column_indices = [attribute.index for attribute in self.attributes]
    sampled_columns = dataset.X.take(column_indices, 1)

    model = CCA(n_components=1, tol=tol)
    model.fit(sampled_columns, dataset.y)

    first_weights = model.x_weights_.transpose()[0]
    self.linear_combination = LinearCombination(self.attributes, first_weights)
def cca_for_ssvep(input_data, sampling_rate, compared_frequencies):
    """Score an EEG recording against SSVEP reference signals using CCA.

    ``input_data`` is indexed as ``[channel, time]`` (the number of time
    points is taken from ``input_data.shape[1]``). For every frequency in
    ``compared_frequencies`` a reference matrix of sine/cosine signals for
    two harmonics is generated, a CCA is fitted between the recording and
    the references (time points are the samples), and the Pearson
    correlation of the first pair of canonical variates is stored.

    Returns a ``(1, len(compared_frequencies))`` float array of those
    correlation coefficients (Rho), one per candidate frequency.

    Fixes over the previous version: ``fit`` was given transposed matrices
    while ``transform``/``score`` received the untransposed ones, and
    ``score`` was passed the Y variates as ``sample_weight``; both would
    fail or return an R^2 rather than a canonical correlation.
    """
    # TODO: Strict input checks, exceptions and avoid crashing and processing errors
    number_time_points = input_data.shape[1]
    number_harmonics = 2

    # One correlation coefficient (Rho) per target SSVEP frequency.
    cca_rho_values = numpy.zeros([1, len(compared_frequencies)], dtype='float')

    # Time axis in seconds; identical for every frequency, so built once.
    time_points_count = numpy.arange(number_time_points, dtype='float')
    time_points_count = time_points_count / sampling_rate

    # CCA expects samples on the first axis: time points become rows.
    eeg_samples = numpy.transpose(input_data)

    for loop_frequencies, frequency in enumerate(compared_frequencies):
        # Rows alternate sine, cosine for harmonic 1, 2, ...
        y_matrix = numpy.zeros([number_harmonics * 2, number_time_points])
        for loop_harmonics in range(number_harmonics):
            base_constant = 2 * numpy.pi * (loop_harmonics + 1) * frequency
            y_matrix[2 * loop_harmonics, :] = numpy.sin(
                base_constant * time_points_count)
            y_matrix[2 * loop_harmonics + 1, :] = numpy.cos(
                base_constant * time_points_count)
        reference_samples = numpy.transpose(y_matrix)

        cca_object = CCA(n_components=number_harmonics)
        cca_object.fit(eeg_samples, reference_samples)
        values_x, values_y = cca_object.transform(eeg_samples,
                                                  reference_samples)

        # Canonical correlation of the first component pair.
        cca_rho_values[0, loop_frequencies] = numpy.corrcoef(
            values_x[:, 0], values_y[:, 0])[0, 1]

    return cca_rho_values
def fit_CCA(tr_block,data_builder): '''We fit a CCA to some 100 odd points??? ''' # train on number of points num_points = 100 PixelPoints = data_builder.sample_random_pixels() points_array_ipw = [] points_array_refl = [] for yr in [14,15]: doy_strings = data_builder.club_days(tr_block[tr_block[0][:,1] == yr]) days_in_sorted = doy_strings.keys() days_in_sorted.sort() ipw_files,refl_files = data_builder.sort_IPW_refl_files_imgs(yr) for set_ in days_in_sorted: print 'Building data set for year: %d and string of days %s'%(yr,set_) # Get the required files only temp_ipw_files = filter(lambda x: re.findall('\d+',x)[1] in doy_strings[set_],ipw_files) temp_refl_files = filter(lambda x: re.findall('\d+',x)[1] in doy_strings[set_],refl_files) temp_ipw_files = map(lambda x: '../data/dataset/20' + str(yr) + os.sep + x,temp_ipw_files) temp_refl_files = map(lambda x: '../data/dataset/20' + str(yr) + os.sep + x,temp_refl_files) for x_,y_ in zip(PixelPoints[:num_points,0],PixelPoints[:num_points,1]): temp_array = data_builder.build_features_and_truth_imgs(temp_ipw_files,temp_refl_files,x_,y_) points_array_ipw.append(temp_array[1]) points_array_refl.append(temp_array[2]) X_ = np.vstack(points_array_ipw) Y_ = np.vstack(points_array_refl) mdl = CCA(n_components = 10) print 'Fitting a CCA...' mdl.fit(X_[:,:1089],Y_[:,:1089]) ipw_frames = X_[:,2178:-1] refl_frames = Y_[:,2178:] del X_ del Y_ ipw_frames = ipw_frames[~np.any(np.isnan(ipw_frames),axis = 1),:] refl_frames = refl_frames[~np.any(np.isnan(refl_frames),axis = 1),:] # indices = [(x*1089,(x+1)*1089)for x in range(4) ] # # the number of components times 4 # ipw_refl_fusion = np.zeros((ipw_frames.shape[0],80)) print 'Building the feature fusion..' return mdl
def mainExec(name_file1, name_file2, features1, features2):
    '''
    Given two files with names, and two files with features, perform the
    Stacked Auxiliary Embedding method on two matrices. The first one is the
    concatenation of both feature lists, the second matrix contains tf-idf
    weighted representations of the training sentences of Flickr30kEntities.

    A 128-component CCA is fitted on the snippet matrices (``fitCCA`` is
    given the file name "ccasnippetmodel.p"), the training matrices are
    projected and augmented, a 96-component CCA is fitted on the augmented
    matrices ("augmentedcca.p"), and the combined ``StackedCCAModel`` is
    pickled to "completestackedCCAModel.p".

    :param name_file1: first file with names
    :param name_file2: second file with names
    :param features1: first file with features
    :param features2: second file with features
    '''
    print("Creating vocabulary")
    voc = readVocabulary()
    print("Generating document vectors")
    occurrenceVectors, idf = createOccurrenceVectors(voc)
    print("Weighing vectors")
    weightedVectors = weight_tfidf(occurrenceVectors, idf)

    print("creating feature dictionary")
    featuresDict = createFeatDict(weightedVectors.keys(), name_file1,
                                  name_file2, features1, features2)
    imagematrix, sentenceMatrix = createSnippetMatrices(featuresDict,
                                                        weightedVectors)

    print("Modelling cca")
    cca = CCA(n_components=128)
    cca = fitCCA(cca, imagematrix, sentenceMatrix, "ccasnippetmodel.p")

    trainingimages, trainingsentences = createTrainMatrices(voc)
    trans_img, trans_sent = cca.transform(trainingimages, trainingsentences)
    nn_img = nearest_neighbor(trainingimages)
    nn_sent = nearest_neighbor(trainingsentences)
    print("NN Image: " + str(nn_img))
    print("NN Sentence: " + str(nn_sent))

    augmented_imgs, augmented_sentences = augmentMatrices(
        nn_img, nn_sent, trainingimages, trainingsentences,
        trans_img, trans_sent)

    print("Fitting augmented CCA model")
    augmentedcca = CCA(n_components=96)
    augmentedcca = fitCCA(augmentedcca, augmented_imgs, augmented_sentences,
                          "augmentedcca.p")

    print("Writing the model to disk")
    resultingModel = StackedCCAModel(nn_img, nn_sent, cca, augmentedcca)
    # Pickles are binary data: open in binary mode and make sure the handle
    # is flushed and closed.
    with open("completestackedCCAModel.p", 'wb') as model_file:
        pickle.dump(resultingModel, model_file)
def main(args):
    """Fit CCA + SVC on the training data, report accuracy, save predictions.

    ``args`` is a 5-item sequence unpacked as ``(training_file, label_file,
    test_file, test_label, u_file)``. Feature files are read with
    ``load_feat``; the label files hold one integer label per line.

    A 100-component CCA is fitted between the training features and the
    first ``len(X_training)`` rows of ``U``. An ``SVC`` is trained on the
    projected training features and evaluated on the projected test
    features. Predictions, true labels and the accuracy are printed, and the
    predictions are written one per line to ``test_file + '.cca.2.pred'``.

    Fixes over the previous version: the test set is now projected with
    ``cca.transform`` (``cca.predict`` returns predictions in the ``U``
    space, not the canonical space the classifier was trained in), and the
    true labels are printed from ``y_test`` instead of the undefined
    ``test_y``.
    """
    (training_file, label_file, test_file, test_label, u_file) = args

    X_training = np.asarray(load_feat(training_file))
    n = len(X_training)
    U = np.asarray(load_feat(u_file))
    with open(label_file) as label_handle:
        y_training = np.asarray(
            [int(line.strip()) for line in label_handle])

    X_test = np.asarray(load_feat(test_file))
    with open(test_label) as label_handle:
        y_test = np.asarray([int(line.strip()) for line in label_handle])

    cca = CCA(n_components=100)
    (X_cca, U_cca) = cca.fit_transform(X_training, U[:n])
    X_test_cca = cca.transform(X_test)

    classifier = SVC()
    classifier.fit(X_cca, y_training)
    pred = classifier.predict(X_test_cca)

    print(pred)
    print(y_test)
    print(accuracy_score(y_test, pred))

    with open(test_file + '.cca.2.pred', 'w') as output:
        for p in pred:
            output.write('%s\n' % p)
    return
def test_cca_implementation():
    """Print predictions of the local ``CCA`` next to scikit-learn's.

    Both models are fitted with 6 components on the same random Gaussian
    data and asked to predict Y from the training X. Nothing is asserted;
    the two outputs are printed for visual comparison.
    """
    def draw(low, high, dim, count):
        # Random integer mean vector, identity covariance.
        mean = np.random.randint(low, high, (dim)).astype('float')
        return np.random.multivariate_normal(mean, np.identity(dim), count)

    separator = '-' * 50

    X = draw(50, 100, 10, 200)
    Y = draw(80, 200, 6, 200)
    # The held-out samples are drawn but not used below; they are kept so
    # the random stream is consumed as before.
    X_test = draw(50, 100, 10, 20)
    Y_test = draw(50, 100, 6, 20)

    mdl_test = CCA(n_components=6)
    mdl_test.fit(X, Y)
    Y_pred = mdl_test.predict(X)
    print(Y_pred)
    print(separator)
    # print Y_test

    from sklearn.cross_decomposition import CCA as CCA_sklearn
    mdl_actual = CCA_sklearn(n_components=6)
    mdl_actual.fit(X, Y)
    print(separator)
    Y_actual = mdl_actual.predict(X)
    print(Y_actual)
n = 500 # 2 latents vars: l1 = np.random.normal(size=n) l2 = np.random.normal(size=n) latents = np.array([l1, l1, l2, l2]).T X = latents + np.random.normal(size=4 * n).reshape((n, 4)) Y = latents + np.random.normal(size=4 * n).reshape((n, 4)) ############################################################################### # Compare the projection on first component of CCA, kernel CCA # with linear kernel, polynomial kernel and rbf kernel cca = CCA(n_components=1) cca.fit(X, Y) r_cca = np.corrcoef(cca.x_scores_.T, cca.y_scores_.T)[0, 1] # linear kernel CCA kcca1 = KernelCCA(kernel="linear", n_components=1, kapa=0.1, eta=0.1, pgso=True, center=True) kcca1.fit(X, Y) kx_linear_scores = np.dot(kcca1.KXc_, kcca1.alphas_) ky_linear_scores = np.dot(kcca1.KYc_, kcca1.betas_) # polynomial kernel CCA kcca2 = KernelCCA(kernel="poly", n_components=1, kapa=0.1, eta=0.1, pgso=True, center=True, coef0=0.1) kcca2.fit(X, Y) kx_poly_scores = np.dot(kcca2.KXc_, kcca2.alphas_)
# Drop outlier rows (threshold 6.0) and keep the matching label rows.
X, good_idx = remove_outliers(X, 6.0)
# `.ix` was removed from pandas; `.loc` performs the same label-based lookup.
y = y.loc[y.index[good_idx]]

# sanity check
# idx = np.random.permutation(len(y))[0]
# idx = np.where(y.index == 119384)[0][0]
# image_sanity_check(y.index[idx], X[idx])

# only keep unique values
unique_cols = ['Class1.1', 'Class1.2', 'Class2.1', 'Class3.1', 'Class4.1',
               'Class5.1', 'Class5.2', 'Class5.3', 'Class6.1', 'Class7.1',
               'Class7.2', 'Class8.1', 'Class8.2', 'Class8.3', 'Class8.4',
               'Class8.5', 'Class8.6', 'Class9.1', 'Class9.2', 'Class10.1',
               'Class10.2', 'Class11.1', 'Class11.2', 'Class11.3',
               'Class11.4', 'Class11.5']

# do CCA: one component per retained label column
if verbose:
    print('Doing CCA...')
cca = CCA(n_components=len(unique_cols), copy=False)
X_cca, y_cca = cca.fit_transform(X, y[unique_cols].values.astype(np.float32))
# Close the pickle file deterministically instead of leaking the handle.
with open(base_dir + 'data/CCA_DCT.pickle', 'wb') as cca_file:
    cPickle.dump(cca, cca_file)

# make plots
make_cca_images(cca, (100, 100), dct_idx=dct_idx)
fig = plot_cca_projections(X_cca)
fig.savefig(plot_dir + 'CCA_dist_no_outliers.png')
if doshow:
    plt.show()

print('Saving the transformed values...')
np.save(base_dir + 'data/CCA_training_transform', X_cca)
# Multivariate response: Y = XB + Gaussian noise + constant offset 5.
# X, B, n and q are defined before this fragment.
Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5
pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# compare pls2.coef_ with B
print("Estimated B")
print(np.round(pls2.coef_, 1))
# The prediction is computed but its result is discarded.
pls2.predict(X)

# PLS regression, with univariate response, a.k.a. PLS1
n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
# y depends on the first two columns of X only, plus noise and offset 5.
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5
pls1 = PLSRegression(n_components=3)
pls1.fit(X, y)
# note that the number of components exceeds 1 (the dimension of y)
print("Estimated betas")
print(np.round(pls1.coef_, 1))

# #############################################################################
# CCA (PLS mode B with symmetric deflation)
# X_train, Y_train, X_test and Y_test are defined outside this fragment.
cca = CCA(n_components=2)
cca.fit(X_train, Y_train)
X_train_r, Y_train_r = cca.transform(X_train, Y_train)
X_test_r, Y_test_r = cca.transform(X_test, Y_test)
# PLS regression with multivariate response (X, Y and B are defined before
# this fragment).
pls2 = PLSRegression(n_components=3)
pls2.fit(X, Y)
print("True B (such that: Y = XB + Err)")
print(B)
# compare pls2.coef_ with B
print("Estimated B")
print(np.round(pls2.coef_, 1))
# The prediction is computed but its result is discarded.
pls2.predict(X)

###############################################################################
# PLS regression, with univariate response, a.k.a. PLS1
n = 1000
p = 10
X = np.random.normal(size=n * p).reshape((n, p))
# y depends on the first two columns of X only, plus noise and offset 5.
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5
pls1 = PLSRegression(n_components=3)
pls1.fit(X, y)
# note that the number of components exceeds 1 (the dimension of y)
print("Estimated betas")
print(np.round(pls1.coef_, 1))

###############################################################################
# CCA (PLS mode B with symmetric deflation)
# X_train, Y_train, X_test and Y_test are defined outside this fragment.
cca = CCA(n_components=2)
cca.fit(X_train, Y_train)
# Project with the CCA model fitted just above; the previous code called
# `plsca.transform`, i.e. a different model than the one being demonstrated.
X_train_r, Y_train_r = cca.transform(X_train, Y_train)
X_test_r, Y_test_r = cca.transform(X_test, Y_test)
# session.execute("CREATE KEYSPACE IF NOT EXISTS TweetsXiaohu WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 3 };") session.execute("USE TweetsXiaohu") # session.execute("DROP TABLE IF EXISTS Tweet") rows = session.execute("SELECT text, hashtags FROM Tweet limit 1000") X, Y = [], [] for row in rows: X.append(row.text) Y.append([x.lower() for x in row.hashtags]) vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english', decode_error='ignore') # print(vectorizer) X = vectorizer.fit_transform(X).toarray() # print '40', X # print type(X) Y_indicator = LabelBinarizer().fit(Y).transform(Y) cca = CCA(n_components = 100, max_iter=10) cca.fit(X, Y_indicator) X = cca.transform(X) # print '45', X # print type(X) classif = OneVsRestClassifier(SVC(kernel='linear')) classif.fit(X, Y) for row in rows: # row = rows[0] # print vectorizer.transform([row.text]).toarray() # print cca.predict(vectorizer.transform([row.text]).toarray()) transformed = vectorizer.transform([row.text]).toarray() # print '55', transformed ccad = cca.transform(transformed) # print '57', ccad
def main(): sess = tf.InteractiveSession() X1_data, X2_data, Y_data, baseline_data, labels_data = read_inputs() # set up the DCCA network keep_input = tf.placeholder("float") keep_hidden = tf.placeholder("float") X1_in, X1_out = build_network(273, 1500, 1500, 1500, 50, keep_input, keep_hidden) X2_in, X2_out = build_network(112, 1500, 1500, 1500, 50, keep_input, keep_hidden) # define the DCCA cost function U = tf.placeholder("float", [50, 40]) V = tf.placeholder("float", [50, 40]) UtF = tf.matmul(tf.transpose(U), tf.transpose(X1_out)) GtV = tf.matmul(X2_out, V) canon_corr = tf.mul(1./BATCH, tf.reduce_sum(tf.mul(tf.matmul(UtF, GtV), tf.constant(np.eye(40), dtype = tf.float32)))) corr_step = tf.train.AdamOptimizer(1e-6).minimize(- canon_corr) sess.run(tf.initialize_all_variables()) # train the network print "Training DCCA" for i in range(0, EPOCHS): for j in range(0, len(X1_data.train), int(BATCH)): X1_in_batch = X1_data.train[j:(j + BATCH)] X2_in_batch = X2_data.train[j:(j + BATCH)] X1_out_batch = X1_out.eval(feed_dict = { X1_in : X1_in_batch, keep_input : 1.0, keep_hidden : 1.0}) X2_out_batch = X2_out.eval(feed_dict = { X2_in : X2_in_batch, keep_input : 1.0, keep_hidden : 1.0}) # compute CCA on the output layers cca = CCA(n_components = 40) cca.fit(X1_out_batch, X2_out_batch) U_batch = cca.x_weights_ V_batch = cca.y_weights_ # perform gradient step corr_step.run(feed_dict = { X1_in : X1_in_batch, X2_in : X2_in_batch, U : U_batch, V : V_batch, keep_input : 0.9, keep_hidden : 0.8}) # print useful info print "EPOCH", i, "/ COST", canon_corr.eval(feed_dict = { X1_in : X1_in_batch, X2_in : X2_in_batch, U : U_batch, V : V_batch, keep_input : 1.0, keep_hidden : 1.0}) # train the softmax classifier print "Training softmax" W_s = weight_variable([89, 39]) b_s = bias_variable([39]) baseline = tf.placeholder("float", [None, 39]) y_true = tf.placeholder("float", [None, 39]) # define the cost X1_baseline_combo = tf.concat(1, [X1_out, baseline]) y_pred = 
tf.nn.softmax(tf.matmul(X1_baseline_combo, W_s) + b_s) lr_cost = - tf.reduce_sum(y_true * tf.log(tf.clip_by_value(y_pred, 1e-10, 1.0))) lr_step = tf.train.AdamOptimizer(1e-4).minimize(lr_cost) # set up accuracy checking correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.argmax(y_true, 1)) accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) sess.run(tf.initialize_all_variables()) for i in range(0, EPOCHS): for j in range(0, len(X1_data.train), int(BATCH)): lr_step.run(feed_dict = { X1_in : X1_data.train[j:(j + BATCH)], y_true : Y_data.train[j:(j + BATCH)], baseline : baseline_data.train[j:(j + BATCH)], keep_input : 1.0, keep_hidden : 1.0}) print i, accuracy.eval(feed_dict = { X1_in : X1_data.dev, y_true : Y_data.dev, baseline : baseline_data.dev, keep_input : 1.0, keep_hidden : 1.0}) print "Test accuracy:", accuracy.eval(feed_dict = { X1_in : X1_data.test, y_true : Y_data.test, baseline : baseline_data.test, keep_input : 1.0, keep_hidden : 1.0}) # project the data and print it to file X1_train_proj = X1_baseline_combo.eval(feed_dict = { X1_in : X1_data.train, baseline : baseline_data.train, keep_input : 1.0, keep_hidden : 1.0}) X1_dev_proj = X1_baseline_combo.eval(feed_dict = { X1_in : X1_data.dev, baseline : baseline_data.dev, keep_input : 1.0, keep_hidden : 1.0}) X1_test_proj = X1_baseline_combo.eval(feed_dict = { X1_in : X1_data.test, baseline : baseline_data.test, keep_input : 1.0, keep_hidden : 1.0}) scipy.io.savemat("dcca_projected_data.mat", {'dataTr' : X1_train_proj, "PhonesTr" : labels_data.train, "dataDev" : X1_dev_proj, "PhonesDev" : labels_data.dev, "dataTest" : X1_test_proj, "PhonesTest" : labels_data.test})
# Prepare the output directory and logging, load the data set described by
# `data_set_config`, fit a CCA on the training pair and log trace-correlation
# results for the test and train splits.
dir_name = configuration.output_parameters['path']
if not os.path.isdir(dir_name):
    os.makedirs(dir_name)
OutputLog().set_path(dir_name)
OutputLog().set_verbosity(configuration.output_parameters['verbosity'])
data_config = ConfigParser.ConfigParser()
data_config.read(data_set_config)
data_parameters = ConfigSectionMap("dataset_parameters", data_config)
# construct data set
data_set = Container().create(data_parameters['name'], data_parameters)
# copy=False lets the model work on the input arrays without copying them.
cca_model = CCA(n_components=top, scale=True, copy=False)
train_transformed_x, train_transformed_y = cca_model.fit_transform(data_set.trainset[0], data_set.trainset[1])
test_transformed_x, test_transformed_y = cca_model.transform(data_set.testset[0], data_set.testset[1])
OutputLog().write('test results:')
# NOTE(review): the testers below receive the raw data_set splits together
# with an IdentityTransformer; the CCA-transformed arrays computed above are
# not used anywhere in this fragment -- confirm that is intended.
correlations, trace_correlation, var, x_test, y_test, test_best_layer = TraceCorrelationTester(
    data_set.testset[0], data_set.testset[1], top).test(IdentityTransformer(), configuration.hyper_parameters)
OutputLog().write('train results:')
correlations, train_trace_correlation, var, x_train, y_train, train_best_layer = TraceCorrelationTester(
    data_set.trainset[0], data_set.trainset[1], top).test(IdentityTransformer(), configuration.hyper_parameters)
OutputLog().write('\nTest results : \n')
# check type of array
#print(np.dtype(data_selection))
# force dtype = float32
data_selection = data_selection.astype(np.float32, copy=False)

# complete cases: keep only rows in which every value is finite. The check
# must use all(); with any() a row holding a single finite value survived
# even when it also contained +/-inf. isfinite() is False for NaN as well,
# so this one mask covers both NaN and infinite entries.
data_selection = data_selection[np.isfinite(data_selection).all(axis=1)]

# target variable / covariates
# NOTE(review): column 3 is in neither y (columns 0-2) nor x (columns 4+);
# confirm that skipping it is intended.
y = data_selection[:, 0:3]
x = data_selection[:, 4:]

# split test-train (80/20, fixed seed)
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    x, y, test_size=0.2, random_state=0)

cca = CCA(n_components=1, scale=True)
cca.fit(x_train, y_train)
#CCA(copy=True, max_iter=500, n_components=1, scale=True, tol=1e-06),
X_train_r, Y_train_r = cca.transform(x_train, y_train)
X_test_r, Y_test_r = cca.transform(x_test, y_test)

print(type(X_train_r))
print(np.shape(X_train_r))
print(np.shape(Y_train_r))
print(np.shape(x))
# Correlation of the first canonical pair on the train and test splits.
print(np.corrcoef(X_train_r[:, 0], Y_train_r[:, 0]))
print(np.corrcoef(X_test_r[:, 0], Y_test_r[:, 0]))
#%% Plot accuracies for PLSSVD plt.figure() for i in range (5): plt.plot(nComponents,plsRegScores[i,:],lw=3) plt.xlim(1,np.amax(nComponents)) plt.title('PLS Regression accuracy') plt.xlabel('Number of components') plt.ylabel('accuracy') plt.legend (['LR','LDA','GNB','Linear SVM','rbf SVM'],loc='lower right') plt.grid(True) if (0): #%% Canonical Correlation Analysis nComponents = np.arange(1,nClasses +1) cca = CCA(n_components=nClasses) cca.fit(Xtrain,Ytrain) XtrainT = cca.transform(Xtrain) XtestT = cca.transform(Xtest) ccaScores = np.zeros((5,np.alen(nComponents))) for i,n in enumerate(nComponents): ccaScores[:,i] = util.classify(XtrainT[:,0:n],XtestT[:,0:n],labelsTrain,labelsTest) cca = CCA(n_components=3) cca.fit(Xtrain,Ytrain) xt = cca.transform(Xtrain) fig = plt.figure() util.plotData(fig,xt,labelsTrain,classColors) plt.title('First 3 components of projected data')
'''
CCA is Canonical Correlation Analysis
'''
# The description above has to be the first statement of the module to become
# its docstring; placed after `__author__` it was a discarded string
# expression and `print(__doc__)` printed None.
__author__ = 'cancobanoglu'

print(__doc__)

from sklearn.cross_decomposition import CCA
from sklearn import datasets

# Four samples: three X features and two Y targets each.
X = [[0., 0., 1.], [1., 0., 0.], [2., 2., 2.], [3., 5., 4.]]
Y = [[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]]

cca = CCA(n_components=1)
cca.fit(X, Y)
# (A stray `CCA(copy=True, max_iter=500, ...)` expression used to follow the
# fit; it only built and discarded a second, unfitted estimator and has been
# dropped.)
X_c, Y_c = cca.transform(X, Y)
class CCA_Model:
    """Cross-channel retrieval on top of a fitted CCA.

    After ``learn_model`` the canonical projections of both channels are kept
    in ``X_transform`` / ``Y_transform``. Queries from one channel are
    projected and matched against the stored projections of the other channel
    by dot product; the ``ntop`` (10) best matches are returned.
    """

    def __init__(self, n_components):
        self.n_components = n_components
        self.cca = CCA(n_components=n_components)
        # Number of best matches returned per query.
        self.ntop = 10

    def learn_model(self, X_chanel, Y_chanel, Y_Distinct=None):
        """Fit the CCA and store the projected reference sets.

        :param X_chanel: array-like for X chanel
        :param Y_chanel: array-like for Y chanel
        :param Y_Distinct: optional array-like; when given, it is projected
            and stored as the Y reference set instead of ``Y_chanel``
        :return: None
        """
        print("Start learning...")
        self.x_dim = len(X_chanel[0])
        self.y_dim = len(Y_chanel[0])
        self.cca.fit(X_chanel, Y_chanel)
        # Identity test: `== None` is evaluated elementwise on numpy arrays
        # and cannot be used as a condition.
        y_reference = Y_chanel if Y_Distinct is None else Y_Distinct
        self.X_transform, self.Y_transform = self.cca.transform(
            X_chanel, y_reference)
        print("Learning completed")

    def _top_matches(self, candidates, query):
        """Return ``[indices, scores]`` of the ``ntop`` largest dot products."""
        scores = np.asarray(np.dot(candidates, query), dtype=float)
        indices = (-scores).argsort()[:self.ntop]
        return [indices, scores[indices]]

    def get_bet_match_index_transform_x2y(self, x_transform):
        """Match one projected X vector against the stored Y projections.

        Returns ``[indices, scores]``, best match first.
        """
        return self._top_matches(self.Y_transform, x_transform)

    def get_bet_match_index_transform_y2x(self, y_transform):
        """Match one projected Y vector against the stored X projections.

        Returns ``[indices, scores]``, best match first.
        """
        return self._top_matches(self.X_transform, y_transform)

    def get_best_match_cross_indices_x2y(self, x_inputs):
        """Project raw X inputs and return the best Y matches for each."""
        x_transformes = self.cca.transform(x_inputs)
        return [self.get_bet_match_index_transform_x2y(x_transform)
                for x_transform in x_transformes]

    def get_best_match_cross_indices_y2x(self, y_inputs):
        """Project raw Y inputs and return the best X matches for each."""
        # A single all-zero X row is passed only because the Y projection is
        # obtained through the two-argument form of `transform`; its X
        # result is discarded.
        placeholder_x = [[0] * self.x_dim]
        _, y_transformes = self.cca.transform(placeholder_x, y_inputs)
        return [self.get_bet_match_index_transform_y2x(y_transform)
                for y_transform in y_transformes]
def mainExec(name_file, features):
    '''
    Based on a list of image names and image features, learn a CCA model
    based on Stacked Auxiliary Embedding and save this model to disk.

    A 128-component CCA is fitted on (image, sentence) snippet matrices and
    pickled to "ccasnippetmodel.p". The flickr30k training pairs are then
    projected with it, augmented with ``phi`` features, and a 96-component
    CCA fitted on the augmented matrices is pickled to "augmentedcca.p".

    Fixes over the previous version:
    * both CCA models are fitted with images as X and sentences as Y, the
      order ``cca.transform`` is called with (the fit used the opposite one);
    * augmented vectors are built by concatenation (``list.extend`` returns
      ``None``, so lists of ``None`` were being fitted);
    * "augmentedcca.p" now holds the augmented model, not the first one;
    * pickle files are opened in binary mode and closed.

    :param name_file: passed through to ``getImage``
    :param features: passed through to ``getImage``
    :return: None
    '''
    print("Creating vocabulary")
    voc = readVocabulary()
    print("Generating document vectors")
    occurrenceVectors, idf = createOccurrenceVectors(voc)
    print("Weighing vectors")
    weightedVectors = weight_tfidf(occurrenceVectors, idf)

    print("Creating matrices")
    # Collect rows in lists and build each matrix once; growing an array
    # with np.concatenate per row is quadratic.
    sentence_rows = []
    image_rows = []
    for i in weightedVectors.keys():
        if isLargeEnough(i):
            print("current Sentence: " + str(len(sentence_rows) + 1))
            sentence_rows.append(
                [float(weight) for weight in weightedVectors[i]])
            image_rows.append(getImage(i, name_file, features))
    sentenceMatrix = np.array(sentence_rows)
    imagematrix = np.array(image_rows)

    print("Modelling cca")
    cca = CCA(n_components=128)
    cca.fit(imagematrix, sentenceMatrix)
    with open("ccasnippetmodel.p", 'wb') as model_file:
        pickle.dump(cca, model_file)

    # Per-term document counts over the training sentences.
    # NOTE(review): the sentences are multiplied by this raw count, not by an
    # inverse document frequency -- confirm the intended weighting.
    idf = np.zeros(len(voc))
    trainingimages = []
    trainingsentences = []
    dp = getDataProvider('flickr30k')
    currentPair = 0
    # NOTE(review): assumes sampleImageSentencePair() yields pair dicts.
    for pair in dp.sampleImageSentencePair():
        currentPair += 1
        if currentPair % 100 == 0:
            print("Current pair: " + str(currentPair))
        trainingimages.append(pair['image']['feat'])
        sentence = getFullSentence(pair)
        for i in range(len(sentence)):
            if sentence[i] > 0:
                idf[i] += 1
        trainingsentences.append(sentence)
    trainingsentences = [np.asarray(sentence) * idf
                         for sentence in trainingsentences]

    trans_img, trans_sent = cca.transform(trainingimages, trainingsentences)
    nn_img = nearest_neighbor(trainingimages)
    nn_sent = nearest_neighbor(trainingsentences)

    # Append the phi features to every original vector.
    augmented_imgs = [
        np.concatenate((np.ravel(image),
                        np.ravel(phi(3000, nn_img, projected))))
        for image, projected in zip(trainingimages, trans_img)]
    augmented_sentences = [
        np.concatenate((np.ravel(sentence),
                        np.ravel(phi(3000, nn_sent, projected))))
        for sentence, projected in zip(trainingsentences, trans_sent)]

    augmentedcca = CCA(n_components=96)
    augmentedcca.fit(augmented_imgs, augmented_sentences)
    with open("augmentedcca.p", 'wb') as model_file:
        pickle.dump(augmentedcca, model_file)
def __init__(self, n_components):
    """Create an unfitted CCA with ``n_components`` components.

    ``ntop`` is set to 10.
    """
    self.ntop = 10
    self.n_components = n_components
    self.cca = CCA(n_components=self.n_components)
[ 156, 33, 54, 15, 225, 73], [ 138, 33, 68, 2, 110, 43] ]
# NOTE(review): the two rows above close a list literal that starts before
# this fragment; X and Y are defined outside the visible code.
print X.shape
#X = N.array(Z)[:,0:3].tolist()
#Y = N.array(Z)[:,3:6].tolist()
print 'X=\n',X
print 'Y=\n',Y
# Correlation matrices between the columns of X and between those of Y.
Rx = N.corrcoef(X.T)
Ry = N.corrcoef(Y.T)
cca = CCA(n_components=1)
cca.fit(X, Y)
print "Rx:\n", Rx
print "Ry:\n", Ry
print "x_weights:\n", cca.x_weights_
print "y_weights:\n", cca.y_weights_
print "x_loadings:\n", cca.x_loadings_
print "y_loadings:\n", cca.y_loadings_
print "x_scores_:\n", cca.x_scores_
print "y_scores_:\n", cca.y_scores_
# Hand-computed loadings (correlation matrix times weights), printed for
# comparison with the model's own x_loadings_ / y_loadings_ shown above.
loadings_man_x = N.dot(Rx, cca.x_weights_)
loadings_man_y = N.dot(Ry, cca.y_weights_)
print "loadings_man_x:\n",loadings_man_x
print "loadings_man_y:\n",loadings_man_y
# each Yj = 1*X1 + 2*X2 + noize Y = np.dot(X, B) + np.random.normal(size=n * q).reshape((n, q)) + 5 pls2 = PLSRegression(n_components=3) pls2.fit(X, Y) print("True B (such that: Y = XB + Err)") print(B) # compare pls2.coef_ with B print("Estimated B") print(np.round(pls2.coef_, 1)) pls2.predict(X) # PLS regression, with univariate response, a.k.a. PLS1 n = 1000 p = 10 X = np.random.normal(size=n * p).reshape((n, p)) y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size=n * 1) + 5 pls1 = PLSRegression(n_components=3) pls1.fit(X, y) # note that the number of components exceeds 1 (the dimension of y) print("Estimated betas") print(np.round(pls1.coef_, 1)) # ############################################################################# # CCA (PLS mode B with symmetric deflation) cca = CCA(n_components=2) cca.fit(X_train, Y_train) X_train_r, Y_train_r = cca.transform(X_train, Y_train) X_test_r, Y_test_r = cca.transform(X_test, Y_test)