def FastICA_data(test_x, train_x, params):
    print 'centering data ...'
    center_test, center_train = center_data(test_x, train_x)
    print 'icaing data ...'
    components = int(params['components'])
    # fit on the centered training data and transform both sets with the same model
    ica = FastICA(n_components=components, whiten=True).fit(center_train)
    ica_train_x = ica.transform(center_train)
    ica_test_x = ica.transform(center_test)
    return ica_test_x, ica_train_x
class ICA(method.Method):

    def __init__(self, params):
        self.params = params
        self.ica = FastICA(**params)

    def __str__(self):
        return "FastICA"

    def train(self, data):
        """
        Train the FastICA on the whitened data

        :param data: whitened data, ready to use
        """
        self.ica.fit(data)

    def encode(self, data):
        """
        Encodes the ready to use data

        :returns: encoded data with dimension n_components
        """
        return self.ica.transform(data)

    def decode(self, components):
        """
        Decode the data to return whitened reconstructed data

        :returns: reconstructed data
        """
        return self.ica.inverse_transform(components)
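# A minimal usage sketch for the wrapper above (not from the original project). It assumes the
# project's `method` module is importable and that the input is already whitened; the array
# `whitened` below is a hypothetical stand-in.
import numpy as np

whitened = np.random.RandomState(0).randn(200, 10)   # stand-in for pre-whitened data
ica = ICA({'n_components': 4, 'random_state': 0})
ica.train(whitened)                                   # fits FastICA
codes = ica.encode(whitened)                          # shape (200, 4)
reconstruction = ica.decode(codes)                    # back to shape (200, 10)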
def best_ica_nba(self):
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_nba_data()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    ica = FastICA(n_components=X_train_scl.shape[1])
    X_train_transformed = ica.fit_transform(X_train_scl, y_train)
    X_test_transformed = ica.transform(X_test_scl)

    ## top 2: rank components by kurtosis on the training set and keep the same two
    ## components for train and test so the feature columns stay aligned
    kurt = kurtosis(X_train_transformed)
    i = kurt.argsort()[::-1]
    X_train_transformed = X_train_transformed[:, i][:, 0:2]
    X_test_transformed = X_test_transformed[:, i][:, 0:2]

    # save
    filename = './' + self.save_dir + '/nba_ica_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_ica_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_ica_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

    filename = './' + self.save_dir + '/nba_ica_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def main(mode): path = "/local/attale00/extracted_pascal__4__Multi-PIE" path_ea = path + "/color128/" allLabelFiles = utils.getAllFiles("/local/attale00/a_labels") labeledImages = [i[0:16] + ".png" for i in allLabelFiles] # labs=utils.parseLabelFiles(path+'/Multi-PIE/labels','mouth',labeledImages,cutoffSeq='.png',suffix='_face0.labels') labs = utils.parseLabelFiles( "/local/attale00/a_labels", "mouth", labeledImages, cutoffSeq=".png", suffix="_face0.labels" ) testSet = fg.dataContainer(labs) roi = (50, 74, 96, 160) X = fg.getAllImagesFlat(path_ea, testSet.fileNames, (128, 256), roi=roi) # perform ICA if mode not in ["s", "v"]: ica = FastICA(n_components=100, whiten=True) ica.fit(X) meanI = np.mean(X, axis=0) X1 = X - meanI data = ica.transform(X1) filters = ica.components_ elif mode in ["s", "v"]: W = np.load("/home/attale00/Desktop/classifiers/ica/filter1.npy") m = np.load("/home/attale00/Desktop/classifiers/ica/meanI1.npy") X1 = X - m data = np.dot(X1, W.T) for i in range(len(testSet.data)): testSet.data[i].extend(data[i, :]) strel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3)) # fg.getHogFeature(testSet,roi,path=path_ea,ending='.png',extraMask = None,orientations = 3, cells_per_block=(6,2),maskFromAlpha=False) # fg.getColorHistogram(testSet,roi,path=path_ea,ending='.png',colorspace='lab',bins=10) testSet.targetNum = map(utils.mapMouthLabels2Two, testSet.target) rf = classifierUtils.standardRF(max_features=np.sqrt(len(testSet.data[0])), min_split=5, max_depth=40) if mode in ["s", "v"]: print "Classifying with loaded classifier" classifierUtils.classifyWithOld( path, testSet, mode, clfPath="/home/attale00/Desktop/classifiers/ica/rf128ICA_1" ) elif mode in ["c"]: print "cross validation of data" print "Scores" # print classifierUtils.standardCrossvalidation(rf,testSet,n_jobs=5) # _cvDissect(testSet,rf) classifierUtils.dissectedCV(rf, testSet) print "----" elif mode in ["save"]: print "saving new classifier" _saveRF(testSet) else: print "not doing anything"
def align(movie_data, options, args, lrh):
    print 'pICA(scikit-learn)'
    nvoxel = movie_data.shape[0]
    nTR = movie_data.shape[1]
    nsubjs = movie_data.shape[2]

    align_algo = args.align_algo
    nfeature = args.nfeature
    randseed = args.randseed

    if not os.path.exists(options['working_path']):
        os.makedirs(options['working_path'])

    # zscore the data
    bX = np.zeros((nsubjs*nTR, nvoxel))
    for m in range(nsubjs):
        for t in range(nTR):
            bX[nTR*m+t, :] = stats.zscore(movie_data[:, t, m].T, axis=0, ddof=1)
    del movie_data

    np.random.seed(randseed)
    A = np.mat(np.random.random((nfeature, nfeature)))

    ica = FastICA(n_components=nfeature, max_iter=500, w_init=A, random_state=randseed)
    ica.fit(bX.T)
    R = ica.transform(bX.T)

    niter = 10
    # initialization when first time run the algorithm
    np.savez_compressed(options['working_path']+align_algo+'_'+lrh+'_'+str(niter)+'.npz',
                        R=R, niter=niter)
    return niter
def ica(tx, ty, rx, ry):
    compressor = ICA(whiten=True)  # for some people, whiten needs to be off
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wICAtr", times=10)
    km(newtx, ty, newrx, ry, add="wICAtr", times=10)
    nn(newtx, ty, newrx, ry, add="wICAtr")
def fastica(eeg_data):
    """
    Sample function to apply `FastICA`_ to the EEG data.

    Parameters
    ----------
    eeg_data : array
        EEG data in a CxTxE array. With C the number of channels, T the
        number of time samples and E the number of events.

    Returns
    -------
    ica : ICA object
        Trained `FastICA`_ object.
    ica_data : array
        EEG projected data in a CxTxE array. With C the number of components,
        T the number of time samples and E the number of events.
    """
    # Dimension shapes
    ch_len = eeg_data.shape[ch_dim]
    t_len = eeg_data.shape[t_dim]
    ev_len = eeg_data.shape[ev_dim]

    # -------------------------------------------------------------------------
    # 1. Fit the FastICA model

    # We need to collapse time and events dimensions
    coll_data = eeg_data.transpose([t_dim, ev_dim, ch_dim])\
        .reshape([t_len*ev_len, ch_len])

    # Fit model
    ica = FastICA()
    ica.fit(coll_data)

    # Normalize ICs to unit norm
    k = np.linalg.norm(ica.mixing_, axis=0)  # column-wise L2 norms
    ica.mixing_ /= k
    ica.components_[:] = (ica.components_.T * k).T

    # -------------------------------------------------------------------------
    # 2. Transform data

    # Project data
    bss_data = ica.transform(coll_data)

    # Adjust shape and dimensions back to "eeg_data" shape
    ic_len = bss_data.shape[1]
    bss_data = np.reshape(bss_data, [ev_len, t_len, ic_len])

    new_order = [0, 0, 0]
    # TODO: Check the following order
    new_order[ev_dim] = 0
    new_order[ch_dim] = 2
    new_order[t_dim] = 1
    bss_data = bss_data.transpose(new_order)

    # End
    return ica, bss_data
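# A quick smoke test for the function above (not from the original source). The original module
# presumably defines ch_dim, t_dim and ev_dim as module-level constants; here they are assumed to
# be 0, 1 and 2, i.e. a channels x time x events array, and the data is just random noise.
import numpy as np

ch_dim, t_dim, ev_dim = 0, 1, 2
eeg = np.random.RandomState(0).randn(8, 128, 20)   # 8 channels, 128 samples, 20 events
ica_model, sources = fastica(eeg)
print(sources.shape)                               # (8, 128, 20): components x time x events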
def ICA(model_data, components=None, transform_data=None):
    t0 = time()
    ica = FastICA(n_components=components)
    if transform_data is None:
        projection = ica.fit_transform(model_data)
    else:
        ica.fit(model_data)
        projection = ica.transform(transform_data)
    print "ICA Time: %0.3f" % (time() - t0)
    return projection
def var_test_ica(flux_arr_orig, exposure_list, wavelengths, low_n=3, hi_n=100, n_step=1, show_plots=False, show_summary_plot=False, save_summary_plot=True, test_ind=7, real_time_progress=False, idstr=None): start_ind = np.min(np.nonzero(flux_arr_orig[test_ind])) end_ind = np.max(np.nonzero(flux_arr_orig[test_ind])) perf_table = Table(names=["n", "avg_diff2", "max_diff_scaled"], dtype=["i4", "f4", "f4"]) if hi_n > flux_arr_orig.shape[0]-1: hi_n = flux_arr_orig.shape[0]-1 for n in range(low_n, hi_n, n_step): ica = FastICA(n_components = n, whiten=True, max_iter=750, random_state=1234975) test_arr = flux_arr_orig[test_ind].copy() flux_arr = np.vstack([flux_arr_orig[:test_ind], flux_arr_orig[test_ind+1:]]) ica_flux_arr = flux_arr.copy() #keep back one for testing ica.fit(ica_flux_arr) ica_trans = ica.transform(test_arr.copy(), copy=True) ica_rev = ica.inverse_transform(ica_trans.copy(), copy=True) avg_diff2 = np.ma.sum(np.ma.power(test_arr-ica_rev[0],2)) / (end_ind-start_ind) max_diff_scaled = np.ma.max(np.ma.abs(test_arr-ica_rev[0])) / (end_ind-start_ind) perf_table.add_row([n, avg_diff2, max_diff_scaled]) if real_time_progress: print "n: {:4d}, avg (diff^2): {:0.5f}, scaled (max diff): {:0.5f}".format(n, avg_diff2, max_diff_scaled) if show_plots: plt.plot(wavelengths, test_arr) plt.plot(wavelengths, ica_rev[0]) plt.plot(wavelengths, test_arr-ica_rev[0]) plt.legend(['orig', 'ica', 'orig-ica']) plt.xlim((wavelengths[start_ind], wavelengths[end_ind])) plt.title("n={}, avg (diff^2)={}".format(n, avg_diff2)) plt.tight_layout() plt.show() plt.close() if show_summary_plot or save_summary_plot: plt.plot(perf_table['n'], perf_table['avg_diff2']) plt.plot(perf_table['n'], perf_table['max_diff_scaled']) plt.title("performance") plt.tight_layout() if show_summary_plot: plt.show() if save_summary_plot: if idstr is None: idstr = random.randint(1000000, 9999999) plt.savefig("ica_performance_{}.png".format(idstr)) plt.close() return perf_table
def ICA_reduction(posture, trainblock, componenet): currentdirectory = os.getcwd() # get the directory. parentdirectory = os.path.abspath(currentdirectory + "/../..") # Get the parent directory(2 levels up) path = parentdirectory + '\Output Files\E5-Dimensionality Reduction/posture-'+str(posture)+'/TrainBlock-'+str(trainblock)+'' if not os.path.exists(path): os.makedirs(path) i_user = 1 block = 1 AUC = [] while i_user <= 31: while block <= 6: train_data = np.genfromtxt("../../Output Files/E3-Genuine Impostor data per user per posture/posture-"+str(posture)+"/User-"+str(i_user)+"/1-"+str(i_user)+"-"+str(posture)+"-"+str(trainblock)+"-GI.csv", dtype=float, delimiter=",") test_data = np.genfromtxt("../../Output Files/E3-Genuine Impostor data per user per posture/posture-"+str(posture)+"/User-"+str(i_user)+"/1-"+str(i_user)+"-"+str(posture)+"-"+str(block)+"-GI.csv", dtype=float, delimiter=",") target_train = np.ones(len(train_data)) row = 0 while row < len(train_data): if np.any(train_data[row, 0:3] != [1, i_user, posture]): target_train[row] = 0 row += 1 row = 0 target_test = np.ones(len(test_data)) while row < len(test_data): if np.any(test_data[row, 0:3] != [1, i_user, posture]): target_test[row] = 0 row += 1 sample_train = train_data[:, [3,4,5,6,7,9,11,12,13,14,15,16,17]] sample_test = test_data[:, [3,4,5,6,7,9,11,12,13,14,15,16,17]] scaler = preprocessing.MinMaxScaler().fit(sample_train) sample_train_scaled = scaler.transform(sample_train) sample_test_scaled = scaler.transform(sample_test) ica = FastICA(n_components=componenet, max_iter=150) sample_train_ica = ica.fit(sample_train_scaled).transform(sample_train_scaled) sample_test_ica = ica.transform(sample_test_scaled) clf = ExtraTreesClassifier(n_estimators=100) clf.fit(sample_train_ica, target_train) prediction = clf.predict(sample_test_ica) auc = metrics.roc_auc_score(target_test, prediction) AUC.append(auc) block += 1 block = 1 i_user += 1 print(AUC) AUC = np.array(AUC) AUC = AUC.reshape(31, 6) np.savetxt("../../Output Files/E5-Dimensionality Reduction/posture-"+str(posture)+"/TrainBlock-"+str(trainblock)+"/ICA-"+str(componenet)+"-Component.csv", AUC, delimiter=",")
def main(mode): path = '/local/attale00/AFLW_ALL/' path_ea = '/local/attale00/AFLW_cropped/mouth_img_error/' # fileNames = utils.getAllFiles(path_ea); labs=utils.parseLabelFiles(path+'/labels/labels','mouth_opening',fileNames,cutoffSeq='.png',suffix='_face0.labels') testSet = fg.dataContainer(labs) components = 150 roi=None X=fg.getAllImagesFlat(path_ea,testSet.fileNames,(40,120),roi=roi) # X=fg.getAllImagesFlat(path_ea,testSet.fileNames,(120,40),roi=roi,resizeFactor = .5) # # perform ICA if mode not in ['s','v']: ica = FastICA(n_components=components,whiten=True) ica.fit(X) meanI=np.mean(X,axis=0) X1=X-meanI data=ica.transform(X1) filters=ica.components_ elif mode in ['s','v']: W=np.load('/home/attale00/Desktop/classifiers/patches/filterMP1.npy') m=np.load('/home/attale00/Desktop/classifiers/patches/meanIMP1.npy') X1=X-m data=np.dot(X1,W.T) for i in range(len(fileNames)): testSet.data[i].extend(data[i,:]) print 'feature vector length: {}'.format(len(testSet.data[0])) testSet.targetNum=map(utils.mapMouthLabels2Two,testSet.target) rf=classifierUtils.standardRF(max_features = np.sqrt(len(testSet.data[0])),min_split=13,max_depth=40) #rf = svm.NuSVC() #rf = linear_model.SGDClassifier(loss='perceptron', eta0=1, learning_rate='constant', penalty=None) if mode in ['s','v']: print 'Classifying with loaded classifier' _classifyWithOld(path,testSet,mode) elif mode in ['c']: print 'cross validation of data' rValues = classifierUtils.dissectedCV(rf,testSet) pickle.dump(rValues,open('errorpatch_ica','w')) elif mode in ['save']: print 'saving new classifier' _saveRF(testSet,rf,filters=filters,meanI=meanI) else: print 'not doing anything'
def test_fit_transform():
    """Test FastICA.fit_transform"""
    rng = np.random.RandomState(0)
    X = rng.random_sample((100, 10))
    for whiten, n_components in [[True, 5], [False, 10]]:
        # when whiten=False, FastICA ignores n_components and keeps all 10 features,
        # which is why the expected shape comes from the list above rather than the
        # hard-coded n_components=5
        ica = FastICA(n_components=5, whiten=whiten, random_state=0)
        Xt = ica.fit_transform(X)
        assert_equal(ica.components_.shape, (n_components, 10))
        assert_equal(Xt.shape, (100, n_components))

        ica = FastICA(n_components=5, whiten=whiten, random_state=0)
        ica.fit(X)
        assert_equal(ica.components_.shape, (n_components, 10))
        Xt2 = ica.transform(X)

        assert_array_almost_equal(Xt, Xt2)
def fastICA(X):
    from sklearn.decomposition import FastICA  # FastICA library

    n, p = X.shape
    M = np.mean(X, axis=0)
    M_est = M
    X2 = X - M

    decomposer = FastICA(n_components=p)
    decomposer.fit(X2)

    A_est = decomposer.mixing_
    W_est = np.linalg.inv(A_est)
    S_est = decomposer.transform(X2)
    return S_est, W_est, M_est
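# A quick consistency check for the helper above (not from the original source): FastICA's mixing
# model gives X - M ≈ S @ A.T, so the returned sources, unmixing matrix and mean should
# reconstruct the input up to numerical error.
import numpy as np

rng = np.random.RandomState(0)
S = rng.laplace(size=(500, 3))            # toy non-Gaussian sources
A = rng.randn(3, 3)                       # toy mixing matrix
X = S @ A.T + 5.0                         # mixed, shifted observations

S_est, W_est, M_est = fastICA(X)
X_rec = S_est @ np.linalg.inv(W_est).T + M_est   # inv(W_est) is the estimated mixing matrix
print(np.allclose(X, X_rec, atol=1e-6))           # should print True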
def test_fit_transform():
    # Test FastICA.fit_transform
    rng = np.random.RandomState(0)
    X = rng.random_sample((100, 10))
    for whiten, n_components in [[True, 5], [False, None]]:
        n_components_ = (n_components if n_components is not None else X.shape[1])

        ica = FastICA(n_components=n_components, whiten=whiten, random_state=0)
        Xt = ica.fit_transform(X)
        assert_equal(ica.components_.shape, (n_components_, 10))
        assert_equal(Xt.shape, (100, n_components_))

        ica = FastICA(n_components=n_components, whiten=whiten, random_state=0)
        ica.fit(X)
        assert_equal(ica.components_.shape, (n_components_, 10))
        Xt2 = ica.transform(X)

        assert_array_almost_equal(Xt, Xt2)
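# A related sanity check (not part of the original test suite): when n_components equals the
# number of features, transform followed by inverse_transform should reproduce the input almost
# exactly, because the fitted unmixing matrix is square and full rank.
import numpy as np
from sklearn.decomposition import FastICA

def check_inverse_transform_roundtrip():
    rng = np.random.RandomState(0)
    X = rng.random_sample((100, 10))
    ica = FastICA(n_components=10, random_state=0)
    Xt = ica.fit_transform(X)
    X_back = ica.inverse_transform(Xt)
    assert np.allclose(X, X_back, atol=1e-6)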
def compute_PCA_ICA_NMF(n_components=5):
    spec_mean = spectra.mean(0)

    # PCA: use randomized PCA for speed
    pca = RandomizedPCA(n_components - 1)
    pca.fit(spectra)
    pca_comp = np.vstack([spec_mean, pca.components_])

    # ICA treats sequential observations as related. Because of this, we need
    # to fit with the transpose of the spectra
    ica = FastICA(n_components - 1)
    ica.fit(spectra.T)
    ica_comp = np.vstack([spec_mean, ica.transform(spectra.T).T])

    # NMF requires all elements of the input to be greater than zero
    spectra[spectra < 0] = 0
    nmf = NMF(n_components)
    nmf.fit(spectra)
    nmf_comp = nmf.components_

    return pca_comp, ica_comp, nmf_comp
print('//===========================pca==========================')
pca = PCA(n)
traindata_pca = pca.fit_transform(traindata)
testdata_pca = pca.transform(testdata)
Faceidentifier(traindata_pca, trainlabel, testdata_pca, testlabel)

print('//===========================sfa==========================')
sfa = sfa.SFA()
traindata_sfa = sfa.fit_transform(traindata.T, conponents=n).T
testdata_sfa = sfa.transform(testdata.T).T
Faceidentifier(traindata_sfa, trainlabel, testdata_sfa, testlabel)

print('//===========================fastica==========================')
fastica = FastICA(n)
traindata_fastica = fastica.fit_transform(traindata)
testdata_fastica = fastica.transform(testdata)
Faceidentifier(traindata_fastica, trainlabel, testdata_fastica, testlabel)

for i in range(0, 9):
    if i == 0:
        b = 0.1
    elif i == 1:
        b = 0.2
    elif i == 2:
        b = 0.5
    elif i == 3:
        b = 0.8
    elif i == 4:
        b = 1
    elif i == 5:
        b = 2
'data': lda_transformed_data } for i in range(2, 9): estimators['untransformed_{}'.format(i)] = { 'est': KMeans(n_clusters=i), 'clusters': i, 'data': data } pca_transformed_data = pca.transform(data) for i in range(2, 9): estimators['pca_transformed_{}'.format(i)] = { 'est': KMeans(n_clusters=i), 'clusters': i, 'data': pca_transformed_data } ica_transformed_data = ica.transform(data) print 'kurt', kurtosis(ica_transformed_data) with open('outputs/ica_transformed_{}.csv'.format(SAMPLE_SIZE), 'w') as ica_data: writer = csv.writer(ica_data) for x, temp_y in zip(ica_transformed_data, y): x = list(x) x.append(temp_y) writer.writerow(x) for i in range(2, 9): estimators['ica_transformed_{}'.format(i)] = { 'est': KMeans(n_clusters=i), 'clusters': i, 'data': ica_transformed_data } for i in range(8): print 'Random Proj'
testSet = fg.dataContainer(labs) roi=(0,37,0,115) X=fg.getAllImagesFlat(path_ea,testSet.fileNames,(37,115),roi=roi) # # perform ICA ica = FastICA(n_components=100,whiten=True) ica.fit(X) meanI=np.mean(X,axis=0) X1=X-meanI data=ica.transform(X1) filters=ica.components_ for i in range(len(fileNames)): testSet.data[i].extend(data[i,:]) testSet.targetNum=map(utils.mapMouthLabels2Two,testSet.target) ############################################################################### # define a pipeline combining a text feature extractor with a simple # classifier clf = RandomForestClassifier() parameters = {'n_estimators': range(10, 40,20), 'max_depth': range(5, 40,5),
# select 2500 random epochs from the data
random.seed(3)
train_eeg_matrix = np.vstack(train_data[random.sample(range(n_train_epochs), 2500), :, :56])

# Compute ICA
ica = FastICA(n_components=train_eeg_matrix.shape[1], random_state=9)
# train on part of the data
ica.fit(train_eeg_matrix)
del train_eeg_matrix
log('ICA computed')

# 2d matrix with all training data we have
data_matrix = np.vstack(train_data[:, :, :])
train_data = ica.transform(data_matrix[:, :56])                    # transform channels to sources data
train_data = np.concatenate((train_data, data_matrix[:, 56:]), 1)  # append additional features
train_data = np.array_split(train_data, n_train_epochs)            # split to epochs
del data_matrix
log('train source data retrieved')

test_data, _ = load_data(folder_name, 'test')
test_data = np.array(get_windows(test_data, window_start, window_size))
n_test_epochs = test_data.shape[0]

# 2d matrix with all test data we have
data_matrix = np.vstack(test_data[:, :, :])
test_data = ica.transform(data_matrix[:, :56])                     # transform channels to sources data
test_data = np.concatenate((test_data, data_matrix[:, 56:]), 1)    # append additional features
def ica(self, whiten=True):
    ica = FastICA(n_components=5, whiten=whiten)
    ica.fit(self.train)
    self.train = ica.transform(self.train)
    self.test = ica.transform(self.test)
def task1c(wine): data = pd.read_csv("winequality-" + wine + ".csv", sep=';') X = data.drop('quality', axis=1) y = data['quality'] scaler = StandardScaler() X = scaler.fit_transform(X) pca = PCA() pca.fit(X) pca_data = pca.transform(X) per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1) labels = ['label' + str(x) for x in range(1, len(per_var) + 1)] plt.bar(x=range(1, 12), height=per_var, tick_label=labels) plt.ylabel('Percentage of Explained Variance') plt.xlabel('Principal Component') plt.title(wine + ' wine PCA Variance') plt.show() pca_df = pd.DataFrame(pca_data, columns=labels) plt.scatter(pca_df.label1, pca_df.label2, edgecolors='black') plt.xlabel('label1 - {0}%'.format(per_var[0])) plt.ylabel('label2 - {0}%'.format(per_var[1])) plt.title(wine + ' wine PCA label1 and label2') plt.show() plt.scatter(pca_df.label1, pca_df.label3, edgecolors='black') plt.xlabel('label1 - {0}%'.format(per_var[0])) plt.ylabel('label3 - {0}%'.format(per_var[2])) plt.title(wine + ' wine PCA label1 and label3') plt.show() plt.scatter(pca_df.label2, pca_df.label3, edgecolors='black') plt.xlabel('label2 - {0}%'.format(per_var[1])) plt.ylabel('label3 - {0}%'.format(per_var[2])) plt.title(wine + ' wine PCA label2 and label3') plt.show() print("correlation between label1 and quality: " + str(pca_df.label1.corr(y))) print("correlation between label2 and quality: " + str(pca_df.label2.corr(y))) print("correlation between label3 and quality: " + str(pca_df.label3.corr(y))) plt.clf() ica = FastICA(n_components=3) ica.fit(X) ica_data = ica.transform(X) labels = ['label' + str(x) for x in range(1, 4)] ica_df = pd.DataFrame(ica_data, columns=labels) plt.scatter(ica_df.label1, ica_df.label2, edgecolors='black') plt.xlabel('label1') plt.ylabel('label2') plt.title(wine + ' wine ICA label1 and label2') plt.show() plt.scatter(ica_df.label1, ica_df.label3, edgecolors='black') plt.xlabel('label1') plt.ylabel('label3') plt.title(wine + ' wine ICA label1 and label3') plt.show() plt.scatter(ica_df.label2, ica_df.label3, edgecolors='black') plt.xlabel('label2') plt.ylabel('label3') plt.title(wine + ' wine ICA label2 and label3') plt.show() print("correlation between label1 and quality: " + str(ica_df.label1.corr(y))) print("correlation between label2 and quality: " + str(ica_df.label2.corr(y))) print("correlation between label3 and quality: " + str(ica_df.label3.corr(y))) plt.clf() X = data.drop('quality', axis=1) y = data['quality'] nmf = NMF(n_components=3, max_iter=10000) nmf.fit(X) nmf_data = nmf.transform(X) nmf_df = pd.DataFrame(nmf_data, columns=labels) plt.scatter(nmf_df.label1, nmf_df.label2, edgecolors='black') plt.xlabel('label1') plt.ylabel('label2') plt.title(wine + ' wine NMF label1 and label2') plt.show() plt.scatter(nmf_df.label1, nmf_df.label3, edgecolors='black') plt.xlabel('label1') plt.ylabel('label3') plt.title(wine + ' wine NMF label1 and label3') plt.show() plt.scatter(nmf_df.label2, nmf_df.label3, edgecolors='black') plt.xlabel('label2') plt.ylabel('label3') plt.title(wine + ' wine NMF label2 and label3') plt.show() print("correlation between label1 and quality: " + str(nmf_df.label1.corr(y))) print("correlation between label2 and quality: " + str(nmf_df.label2.corr(y))) print("correlation between label3 and quality: " + str(nmf_df.label3.corr(y))) print("end")
# shape
print('Shape train: {}\nShape test: {}'.format(train.shape, test.shape))

from sklearn.decomposition import PCA, FastICA

n_comp = 10

# PCA
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# Append decomposition components to datasets
for i in range(1, n_comp + 1):
    train['pca_' + str(i)] = pca2_results_train[:, i - 1]
    test['pca_' + str(i)] = pca2_results_test[:, i - 1]

    train['ica_' + str(i)] = ica2_results_train[:, i - 1]
    test['ica_' + str(i)] = ica2_results_test[:, i - 1]

y_train = train["y"]
y_mean = np.mean(y_train)

############################################### modeling #########################################
'''
model1=XGBRegressor(n_estimators=500,max_depth=4)
class spectral_data(object): def __init__(self, df): try: uppercols = df.columns.levels[0] lowercols = list(df.columns.levels[1].values) except: df.columns = pd.MultiIndex.from_tuples(list(df.columns)) uppercols = df.columns.levels[0] lowercols = list(df.columns.levels[1].values) for i, val in enumerate(lowercols): try: lowercols[i] = float(val) except: lowercols[i] = val levels = [uppercols, lowercols] df.columns.set_levels(levels, inplace=True) self.df = df def interp(self, xnew): xnew = np.array(xnew, dtype='float') metadata_cols = self.df.columns.levels[0] != 'wvl' metadata = self.df[self.df.columns.levels[0][metadata_cols]] old_wvls = np.array(self.df['wvl'].columns, dtype='float') old_spectra = np.array(self.df['wvl']) new_spectra = np.empty([len(old_spectra[:, 0]), len(xnew)]) * np.nan interp_index = (xnew > min(old_wvls)) & (xnew < max(old_wvls)) f = sp.interpolate.interp1d(old_wvls, old_spectra, axis=1) new_spectra[:, interp_index] = f(xnew[interp_index]) xnew = list(xnew) for i, x in enumerate(xnew): xnew[i] = ('wvl', x) new_df = pd.DataFrame(new_spectra, columns=pd.MultiIndex.from_tuples(xnew), index=self.df.index) new_df = pd.concat([new_df, metadata], axis=1) self.df = new_df def cal_tran(self, refdata, matchcol_ref, matchcol_transform, method, methodparams): C_matrix = [] col = np.array([j.upper() for j in self.df[('meta', matchcol_transform)]]) col_ref = np.array([j.upper() for j in refdata[('meta', matchcol_ref)]]) for i in col: matches = np.where(col_ref == i, 1, 0) C_matrix.append(matches) C_matrix = np.transpose(np.array(C_matrix)) if method == 'LRA - Low Rank Alignment': refdata_trans, transdata_trans = LRA(np.array(refdata['wvl']), np.array(self.df['wvl']), C_matrix, methodparams['d']) refdata_trans = pd.DataFrame(refdata_trans) transdata_trans = pd.DataFrame(transdata_trans) pass if method == 'PDS Piecewise Direct Standardization': print('PDS not implemented yet!!') pass # This function masks out specified ranges of the data def mask(self, maskfile, maskvar='wvl'): df_spectra = self.df[maskvar] # extract just the spectra from the data frame metadata_cols = self.df.columns.levels[0] != maskvar # extract just the metadata metadata = self.df[self.df.columns.levels[0][metadata_cols]] mask = pd.read_csv(maskfile, sep=',') # read the mask file tmp = [] for i in mask.index: tmp.append((np.array(self.df[maskvar].columns, dtype='float') >= mask.ix[i, 'min_wvl']) & ( np.array(self.df[maskvar].columns, dtype='float') <= mask.ix[i, 'max_wvl'])) # combine the indexes for each range in the mask file into a single masking vector and use that to mask the spectra masked = np.any(np.array(tmp), axis=0) spectcols = list(df_spectra.columns) # get the list of columns in the spectra dataframe for i, j in enumerate(masked): # change the first level of the tuple from 'wvl' to 'masked' where appropriate if j == True: spectcols[i] = ('masked', spectcols[i]) else: spectcols[i] = (maskvar, spectcols[i]) df_spectra.columns = pd.MultiIndex.from_tuples( spectcols) # assign the multiindex columns based on the new tuples self.df = pd.concat([df_spectra, metadata], axis=1) # merge the masked spectra back with the metadata def multiply_vector(self, vectorfile): df_spectra = self.df['wvl'] # TODO: check to make sure wavelengths match before multiplying vector = np.array(pd.read_csv(vectorfile, sep=',', header=None))[:, 1] if df_spectra.shape[1] == vector.shape[0]: self.df['wvl'] = df_spectra.multiply(vector, axis=1) else: print('Vector is not the same size as the spectra!') def peak_area(self, 
peaks_mins_file=None): df = self.df # create a copy of the data wvls = df['wvl'].columns.values # get the wavelengths if peaks_mins_file is not None: peaks_mins = pd.read_csv(peaks_mins_file, sep=',') peaks = peaks_mins['peaks'] mins = peaks_mins['mins'] pass else: ave_spect = np.average(np.array(df['wvl']), axis=0) # find the average of the spectra in the data frame peaks = wvls[ sp.signal.argrelextrema(ave_spect, np.greater_equal)[0]] # find the maxima in the average spectrum mins = wvls[sp.signal.argrelextrema(ave_spect, np.less_equal)[0]] # find the maxima in the average spectrum wvls = df['wvl'].columns.values # get the wavelengths spectra = np.array(df['wvl']) for i in range(len(peaks)): # get the wavelengths between two minima try: low = mins[np.where(mins < peaks[i])[0][-1]] except: low = mins[0] try: high = mins[np.where(mins > peaks[i])[0][0]] except: high = mins[-1] peak_indices = np.all((wvls > low, wvls < high), axis=0) # plot.plot(wvls,ave_spect) # plot.plot(wvls[peak_indices],ave_spect[peak_indices]) # plot.show() df[('peak_area', peaks[i])] = spectra[:, peak_indices].sum(axis=1) self.df = df return peaks, mins # This function divides the data up into a specified number of random folds def random_folds(self, nfolds=5, seed=10, groupby=None): self.df[('meta', 'Folds')] = np.nan # Create an entry in the data frame that holds the folds foldslist = np.array(self.df[('meta', 'Folds')]) if groupby == None: # if no column name is listed to group on, just create random folds n = len(self.df.index) folds = cross_validation.KFold(n, nfolds, shuffle=True, random_state=seed) i = 1 for train, test in folds: foldslist[test] = i i = i + 1 else: # if a column name is provided, get all the unique values and define folds # so that all rows of a given value fall in the same fold # (this is useful to ensure that training and test data are truly independent) unique_inds = np.unique(self.df[groupby]) folds = cross_validation.KFold(len(unique_inds), nfolds, shuffle=True, random_state=seed) foldslist = np.array(self.df[('meta', 'Folds')]) i = 1 for train, test in folds: tmp = unique_inds[test] tmp_full_list = np.array(self.df[groupby]) tmp_ind = np.in1d(tmp_full_list, tmp) foldslist[tmp_ind] = i i = i + 1 self.df[('meta', 'Folds')] = foldslist # this function divides the data up into a specified number of folds, using sorting # To try to get folds that look similar to each other def stratified_folds(self, nfolds=5, sortby=None): self.df[('meta', 'Folds')] = np.NaN # Create an entry in the data frame that holds the folds self.df.sort_values(by=sortby, inplace=True) # sort the data frame by the column of interest uniqvals = np.unique(self.df[sortby]) # get the unique values from the column of interest # assign folds by stepping through the unique values fold_num = 1 for i in uniqvals: ind = self.df[sortby] == i # find where the data frame matches the unique value self.df.set_value(self.df.index[ind], ('meta', 'Folds'), fold_num) # Inrement the fold number, reset to 1 if it is greater than the desired number of folds fold_num = fold_num + 1 if fold_num > nfolds: fold_num = 1 # sort by index to return the df to its original order self.df.sort_index(inplace=True) self.folds_hist(sortby,50) def folds_hist(self, col_to_plot, nbins, xlabel='wt.%', ylabel='# of spectra'): folds_uniq = np.unique(self.df[('meta', 'Folds')]) for f in folds_uniq: temp = self.rows_match(('meta', 'Folds'), [f]) vals = np.array(temp.df[col_to_plot]) bins = np.linspace(0, np.max(vals), nbins) plot.hist(vals, linewidth=0.5, 
edgecolor='k') plot.xlabel(xlabel) plot.ylabel(ylabel) plot.title(str(col_to_plot[1]) + '- Fold ' + str(f)) fig = plot.gcf() fig.savefig('hist_fold_' + str(f) + '_' + col_to_plot[1] + '.png') plot.close() # This function normalizes specified ranges of the data by their respective sums def norm(self, ranges, col_var='wvl'): df_tonorm = self.df[col_var] top_level_cols = self.df.columns.levels[0] top_level_cols = top_level_cols[top_level_cols != col_var] df_other = self.df[top_level_cols] cols = df_tonorm.columns.values df_sub_norm = [] allind = [] for i in ranges: # Find the indices for the range ind = (np.array(cols, dtype='float') >= i[0]) & (np.array(cols, dtype='float') <= i[1]) # find the columns for the range normcols = cols[ind] # keep track of the indices used for all ranges allind.append(ind) # normalize over the current range df_sub_norm.append(norm_total(df_tonorm[normcols])) # collapse the list of indices used to a single array allind = np.sum(allind, axis=0) # identify columns that were not used by where the allind array is less than 1 cols_excluded = cols[np.where(allind < 1)] # create a separate data frame containing the un-normalized columns df_masked = df_tonorm[cols_excluded] # combine the normalized data frames into one df_norm = pd.concat(df_sub_norm, axis=1) # make the columns into multiindex df_masked.columns = [['masked'] * len(df_masked.columns), df_masked.columns] df_norm.columns = [[col_var] * len(df_norm.columns), df_norm.columns.values] # combine the normalized data frames, the excluded columns, and the metadata into a single data frame df_new = pd.concat([df_other, df_norm, df_masked], axis=1) self.df = df_new # This function applies baseline removal to the data def remove_baseline(self, method='ALS', segment=True, params=None): wvls = np.array(self.df['wvl'].columns.values, dtype='float') spectra = np.array(self.df['wvl'], dtype='float') # set baseline removal object (br) to the specified method if method == 'ALS': br = ALS() elif method == 'Dietrich': br = Dietrich() elif method == 'Polyfit': br = PolyFit() elif method == 'AirPLS': br = AirPLS() elif method == 'FABC': br = FABC() elif method == 'KK': br = KK() elif method == 'Mario': br = Mario() elif method == 'Median': br = MedianFilter() elif method == 'Rubberband': br = Rubberband() elif method == 'CCAM': br = ccam_br() # if method == 'wavelet': # br=Wavelet() else: print(method + ' is not recognized!') # if parameters are provided, use them to set the parameters of br if params is not None: for i in params.keys(): try: setattr(br, i, params[i]) except: print('Required keys are:') print(br.__dict__.keys()) print('Exiting without removing baseline!') return br.fit(wvls, spectra, segment=segment) self.df_baseline = self.df.copy() self.df_baseline['wvl'] = br.baseline self.df['wvl'] = self.df['wvl']-self.df_baseline['wvl'] # This function finds rows of the data frame where a specified column has # values matching a specified set of values # (Useful for extracting folds) def rows_match(self, column_name, isin_array, invert=False): if invert: new_df = self.df.loc[-self.df[column_name].isin(isin_array)] else: new_df = self.df.loc[self.df[column_name].isin(isin_array)] return spectral_data(new_df) # This function takes the sum of data over two specified wavelength ranges, # calculates the ratio of the sums, and adds the ratio as a column in the data frame def ratio(self, range1, range2, rationame=''): cols = self.df['wvl'].columns.values cols1 = cols[(cols >= range1[0]) & (cols <= range1[1])] cols2 = cols[(cols >= 
range2[0]) * (cols <= range2[1])] df1 = self.df['wvl'].loc[:, cols1] df2 = self.df['wvl'].loc[:, cols2] sum1 = df1.sum(axis=1) sum2 = df2.sum(axis=1) ratio = sum1 / sum2 self.df[('ratio', rationame)] = ratio def standard_scale(self, col): self.df[col] = StandardScaler().fit_transform(self.df[col]) def deriv(self): new_df=self.df.copy() wvls=self.df['wvl'].columns.values new_df['wvl'] = self.df['wvl'].diff(axis=1)/wvls foo=new_df['wvl'].columns.values new_df=new_df.drop(('wvl',self.df['wvl'].columns.values[0]),axis=1) foo2=new_df['wvl'].columns.values return spectral_data(new_df) def dim_red(self, col, method, params, kws, load_fit=None): if method == 'PCA': self.do_dim_red = PCA(*params, **kws) if method == 'FastICA': self.do_dim_red = FastICA(*params, **kws) if method == 't-SNE': self.do_dim_red = TSNE(*params, **kws) if method == 'LLE': self.do_dim_red = LocallyLinearEmbedding(*params, **kws) if method == 'JADE-ICA': self.do_dim_red = JADE(*params, **kws) # TODO: Add ICA-JADE here if load_fit: self.do_dim_red = load_fit else: if method != 't-SNE': self.do_dim_red.fit(self.df[col]) dim_red_result = self.do_dim_red.transform(self.df[col]) else: dim_red_result = self.do_dim_red.fit_transform(self.df[col]) for i in list(range(1, dim_red_result.shape[1] + 1)): # will need to revisit this for other methods that don't use n_components to make sure column names still mamke sense self.df[(method, str(i))] = dim_red_result[:, i - 1] return self.do_dim_red def outlier_removal(self, col, method, params): if method == 'Isolation Forest': self.do_outlier_removal = IsolationForest(**params) else: method == None self.do_outlier_removal.fit(np.array(self.df[col])) outlier_scores = self.do_outlier_removal.decision_function(np.array(self.df[col])) self.df[('meta','Outlier Scores - '+method+str(params))] = outlier_scores #is_outlier = self.do_outlier_removal.predict(np.array(self.df[col])) #self.df[('meta', 'Outliers - ' + method + str(params))] = is_outlier return self.do_outlier_removal def pca(self, col, nc=None, load_fit=None): if nc: self.do_pca = PCA(n_components=nc) self.do_pca.fit(self.df[col]) if load_fit: # use this to load a previous fit rather than fit the current data self.do_pca = load_fit pca_result = self.do_pca.transform(self.df[col]) for i in list(range(1, self.do_pca.n_components + 1)): self.df[('PCA', i)] = pca_result[:, i - 1] def ica(self, col, nc=None, load_fit=None): if nc: self.do_ica = FastICA(n_components=nc) self.do_ica.fit(self.df[col]) if load_fit: # use this to load a previous fit rather than fit the current data self.do_ica = load_fit ica_result = self.do_ica.transform(self.df[col]) for i in list(range(1, self.do_ica.n_components + 1)): self.df[('ICA', i)] = ica_result[:, i - 1] def ica_jade(self, col, nc=None, load_fit=None, corrcols=None): if load_fit is not None: # use this to load a previous fit rather than fit the current data scores = np.dot(load_fit, self.df[col]) else: scores = jade(self.df[col].values, m=nc, verbose=False) loadings = np.dot(scores, self.df[col]) icacols = [] for i in list(range(1, len(scores[:, 0]) + 1)): if np.abs(np.max(loadings[i - 1, :])) < np.abs( np.min(loadings[i - 1, :])): # flip the sign if necessary to look nicer loadings[i - 1, :] = loadings[i - 1, :] * -1 scores[i - 1, :] = scores[i - 1, :] * -1 icacols.append(('ICA-JADE', i)) self.df[('ICA-JADE', i)] = scores[i - 1, :].T self.ica_jade_loadings = loadings if corrcols: combined_cols = corrcols + icacols corrdf = self.df[combined_cols].corr().drop(icacols, 1).drop(corrcols, 0) ica_jade_ids 
= [] for i in corrdf.loc['ICA-JADE'].index: tmp = corrdf.loc[('ICA-JADE', i)] match = tmp.values == np.max(tmp) ica_jade_ids.append(corrcols[np.where(match)[0]][1] + ' (r=' + str(np.round(np.max(tmp), 1)) + ')') pass self.ica_jade_corr = corrdf self.ica_jade_ids = ica_jade_ids def col_within_range(self, rangevals, col): mask = (self.df[('meta', col)] > rangevals[0]) & (self.df[('meta', col)] < rangevals[1]) return self.df.loc[mask] def enumerate_duplicates(self, col): rows = self.df[('meta', col)] rows = rows.fillna('-') rows = [str(x) for x in rows] unique_rows = np.unique(rows) rows=np.array(rows) rows_list=list(rows) for i in unique_rows: if i is not '-': matchindex = np.where(rows == i)[0] if len(matchindex) > 1: for n, name in enumerate(rows[matchindex]): rows_list[matchindex[n]] = i+ ' - ' + str(n + 1) self.df[('meta', col)] = rows_list
def ica_original_25_components():
    filename = ("nba_original_ica_transformed_25d_matrix.npy")
    # keep 25 components, matching the function name and the output filename
    ica = FastICA(n_components=25, algorithm='deflation', max_iter=100)
    ica.fit(players_stat)
    transformed_data = ica.transform(players_stat)
    np.save(filename, transformed_data)
plt.xlabel('Number of components') plt.ylabel('accuracy') plt.legend(['LR', 'LDA', 'GNB', 'Linear SVM', 'rbf SVM'], loc='lower right') plt.grid(True) if (0): # ICA from sklearn.decomposition import FastICA nComponents = np.arange(1, nClasses + 1 + 50) icaScores = np.zeros((5, np.alen(nComponents))) for i, n in enumerate(nComponents): icaT = FastICA(n_components=n, max_iter=10000) icaT.fit(Xtrain, labelsTrain) XtrainT = icaT.transform(Xtrain) XtestT = icaT.transform(Xtest) icaScores[:, i] = util.classify(XtrainT, XtestT, labelsTrain, labelsTest) ica = FastICA(n_components=3, max_iter=10000) ica.fit(Xtrain, labelsTrain) xt = ica.transform(Xtrain) fig = plt.figure() util.plotData(fig, xt[:, :3], labelsTrain, classColors) plt.title('First 3 components of projected data') #%% Plot accuracies for ICA plt.figure() for i in range(5): plt.plot(nComponents, icaScores[i, :], lw=3)
y_pred = clf.predict(X_test_pca) accuracies.append(float(np.sum(y_test == y_pred)) / len(y_pred)) components.append(n_components) print('For ' + str(n_components) + ' components, accuracy is ' + str(float(np.sum(y_test == y_pred)) / len(y_pred)) + ' confusion matrix is: ') # print(confusion_matrix(y_test, y_pred, labels=range(n_classes))) # print(classification_report(y_test, y_pred, target_names=target_names)) ############# ICA ica = FastICA(n_components=n_components) S_ = ica.fit_transform(X) A_ = ica.mixing_ X_train_ica = ica.transform(X_train) X_test_ica = ica.transform(X_test) param_grid = { 'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid) clf = clf.fit(X_train_ica, y_train) y_pred = clf.predict(X_test_ica) accuracies_ica.append(float(np.sum(y_test == y_pred)) / len(y_pred)) components_ica.append(n_components) print('For ' + str(n_components) + ' components, accuracy is ' + str(float(np.sum(y_test == y_pred)) / len(y_pred)) +
def get_dc_feature(df_train, df_test, n_comp=12, id_column=None, label_column=None):
    """
    Construct decomposition (dimensionality-reduction) features.
    """
    train = df_train.copy()
    test = df_test.copy()

    if id_column:
        train_id = train[id_column]
        test_id = test[id_column]
        train = drop_columns(train, [id_column])
        test = drop_columns(test, [id_column])
    if label_column:
        train_y = train[label_column]
        train = drop_columns(train, [label_column])

    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
    tsvd_results_train = tsvd.fit_transform(train)
    tsvd_results_test = tsvd.transform(test)

    # PCA
    pca = PCA(n_components=n_comp, random_state=420)
    pca2_results_train = pca.fit_transform(train)
    pca2_results_test = pca.transform(test)

    # ICA
    ica = FastICA(n_components=n_comp, random_state=420)
    ica2_results_train = ica.fit_transform(train)
    ica2_results_test = ica.transform(test)

    # GRP
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
    grp_results_train = grp.fit_transform(train)
    grp_results_test = grp.transform(test)

    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
    srp_results_train = srp.fit_transform(train)
    srp_results_test = srp.transform(test)

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]

        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]

        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]

        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]

    if id_column:
        train[id_column] = train_id
        test[id_column] = test_id
    if label_column:
        train[label_column] = train_y

    return train, test
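# A small usage sketch for get_dc_feature (not from the original source), using toy DataFrames of
# random numbers and leaving id_column/label_column at their defaults so the project's
# drop_columns helper is not needed. With n_comp=2, five decomposition families add ten columns.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
toy_train = pd.DataFrame(rng.rand(50, 6), columns=['f{}'.format(i) for i in range(6)])
toy_test = pd.DataFrame(rng.rand(20, 6), columns=['f{}'.format(i) for i in range(6)])

train_dc, test_dc = get_dc_feature(toy_train, toy_test, n_comp=2)
print(train_dc.shape, test_dc.shape)   # (50, 16) (20, 16): 6 original + 5 * 2 new columns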
        if labels[i] == predictions[i]:
            true += 1
        else:
            false += 1
    return (true / (false + true)) * 100


if __name__ == '__main__':
    train_images = transform_images(train_path)
    test_images = transform_images(test_path)

    x = []
    y = []
    for i in range(50, 500, 10):
        ica = FastICA(n_components=i, whiten=True)
        train = ica.fit(train_images[0]).transform(train_images[0])
        test = ica.transform(test_images[0])
        clsf = svm.SVC()
        clsf.fit(train, train_images[1])
        predvals = clsf.predict(test)
        x.append(i)
        y.append(benchmark(test_images[1], predvals))

    plt.plot(x, y)
    plt.xlabel('Number of independent components')
    plt.ylabel('Performance in %')
    plt.show()
(numDay - 2) * duration), ] Y_inner_trainingset = Y_arr[subtrain[train], ].reshape( (numDay - 2) * duration) X_validate = X_all[X_arr[subtrain[validate], ], ].reshape( duration, 600) Y_validate = Y_all[X_arr[subtrain[validate]]].reshape(duration) for C in components: print(test, subtrain[validate], subtrain[train], C) ica = FastICA(n_components=C, max_iter=5000, tol=0.0001) #tol = 0.001 X_inner_train = ica.fit_transform( X_inner_trainingset ) #pull components from ica fit transformation X_inner_test = ica.transform(X_validate) clf = svm.SVC(kernel='linear', class_weight='balanced', probability=True) y_inner_score = clf.fit( X_inner_train, Y_inner_trainingset).decision_function(X_inner_test) fpr, tpr, _ = roc_curve(Y_validate, y_inner_score) roc_auc[t, v, c] = auc(fpr, tpr) c += 1 v += 1 best[t, 0] = int(np.argmax(np.mean(roc_auc[t, :, :], axis=0))) print('Best components | %0.0f' % (components[int(best[t, 0])]))
def refineregressor( fmridata, fmritr, shiftedtcs, weights, passnum, lagstrengths, lagtimes, lagsigma, lagmask, R2, theprefilter, optiondict, padtrs=60, bipolar=False, includemask=None, excludemask=None, debug=False, rt_floatset=np.float64, rt_floattype="float64", ): """ Parameters ---------- fmridata : 4D numpy float array fMRI data fmritr : float Data repetition rate, in seconds shiftedtcs : 4D numpy float array Time aligned voxel timecourses weights : unknown unknown passnum : int Number of the pass (for labelling output) lagstrengths : 3D numpy float array Maximum correlation coefficient in every voxel lagtimes : 3D numpy float array Time delay of maximum crosscorrelation in seconds lagsigma : 3D numpy float array Gaussian width of the crosscorrelation peak, in seconds. lagmask : 3D numpy float array Mask of voxels with successful correlation fits. R2 : 3D numpy float array Square of the maximum correlation coefficient in every voxel theprefilter : function The filter function to use optiondict : dict Dictionary of all internal rapidtide configuration variables. padtrs : int, optional Number of timepoints to pad onto each end includemask : 3D array Mask of voxels to include in refinement. Default is None (all voxels). excludemask : 3D array Mask of voxels to exclude from refinement. Default is None (no voxels). debug : bool Enable additional debugging output. Default is False rt_floatset : function Function to coerce variable types rt_floattype : {'float32', 'float64'} Data type for internal variables Returns ------- volumetotal : int Number of voxels processed outputdata : float array New regressor maskarray : 3D array Mask of voxels used for refinement """ inputshape = np.shape(fmridata) if optiondict["ampthresh"] < 0.0: if bipolar: theampthresh = tide_stats.getfracval(np.fabs(lagstrengths), -optiondict["ampthresh"], nozero=True) else: theampthresh = tide_stats.getfracval(lagstrengths, -optiondict["ampthresh"], nozero=True) print( "setting ampthresh to the", -100.0 * optiondict["ampthresh"], "th percentile (", theampthresh, ")", ) else: theampthresh = optiondict["ampthresh"] if bipolar: ampmask = np.where( np.fabs(lagstrengths) >= theampthresh, np.int16(1), np.int16(0)) else: ampmask = np.where(lagstrengths >= theampthresh, np.int16(1), np.int16(0)) if optiondict["lagmaskside"] == "upper": delaymask = np.where( (lagtimes - optiondict["offsettime"]) > optiondict["lagminthresh"], np.int16(1), np.int16(0), ) * np.where( (lagtimes - optiondict["offsettime"]) < optiondict["lagmaxthresh"], np.int16(1), np.int16(0), ) elif optiondict["lagmaskside"] == "lower": delaymask = np.where( (lagtimes - optiondict["offsettime"]) < -optiondict["lagminthresh"], np.int16(1), np.int16(0), ) * np.where( (lagtimes - optiondict["offsettime"]) > -optiondict["lagmaxthresh"], np.int16(1), np.int16(0), ) else: abslag = abs(lagtimes) - optiondict["offsettime"] delaymask = np.where(abslag > optiondict["lagminthresh"], np.int16(1), np.int16(0)) * np.where( abslag < optiondict["lagmaxthresh"], np.int16(1), np.int16(0)) sigmamask = np.where(lagsigma < optiondict["sigmathresh"], np.int16(1), np.int16(0)) locationmask = lagmask + 0 if includemask is not None: locationmask = locationmask * includemask if excludemask is not None: locationmask = locationmask * (1 - excludemask) locationmask = locationmask.astype(np.int16) print("location mask created") # first generate the refine mask locationfails = np.sum(1 - locationmask) ampfails = np.sum(1 - ampmask * locationmask) lagfails = np.sum(1 - delaymask * locationmask) 
sigmafails = np.sum(1 - sigmamask * locationmask) refinemask = locationmask * ampmask * delaymask * sigmamask if tide_stats.getmasksize(refinemask) == 0: print("ERROR: no voxels in the refine mask:") print( "\n ", locationfails, " locationfails", "\n ", ampfails, " ampfails", "\n ", lagfails, " lagfails", "\n ", sigmafails, " sigmafails", ) if (includemask is None) and (excludemask is None): print("\nRelax ampthresh, delaythresh, or sigmathresh - exiting") else: print( "\nChange include/exclude masks or relax ampthresh, delaythresh, or sigmathresh - exiting" ) return 0, None, None, locationfails, ampfails, lagfails, sigmafails if optiondict["cleanrefined"]: shiftmask = locationmask else: shiftmask = refinemask volumetotal = np.sum(shiftmask) reportstep = 1000 # timeshift the valid voxels if optiondict["nprocs"] > 1: # define the consumer function here so it inherits most of the arguments def timeshift_consumer(inQ, outQ): while True: try: # get a new message val = inQ.get() # this is the 'TERM' signal if val is None: break # process and send the data outQ.put( _procOneVoxelTimeShift( val, fmridata[val, :], lagstrengths[val], R2[val], lagtimes[val], padtrs, fmritr, theprefilter, optiondict["fmrifreq"], refineprenorm=optiondict["refineprenorm"], lagmaxthresh=optiondict["lagmaxthresh"], refineweighting=optiondict["refineweighting"], detrendorder=optiondict["detrendorder"], offsettime=optiondict["offsettime"], filterbeforePCA=optiondict["filterbeforePCA"], psdfilter=optiondict["psdfilter"], rt_floatset=rt_floatset, rt_floattype=rt_floattype, )) except Exception as e: print("error!", e) break data_out = tide_multiproc.run_multiproc( timeshift_consumer, inputshape, shiftmask, nprocs=optiondict["nprocs"], showprogressbar=True, chunksize=optiondict["mp_chunksize"], ) # unpack the data psdlist = [] for voxel in data_out: shiftedtcs[voxel[0], :] = voxel[1] weights[voxel[0], :] = voxel[2] if optiondict["psdfilter"]: psdlist.append(voxel[3]) del data_out else: psdlist = [] for vox in range(0, inputshape[0]): if (vox % reportstep == 0 or vox == inputshape[0] - 1) and optiondict["showprogressbar"]: tide_util.progressbar(vox + 1, inputshape[0], label="Percent complete (timeshifting)") if shiftmask[vox] > 0.5: retvals = _procOneVoxelTimeShift( vox, fmridata[vox, :], lagstrengths[vox], R2[vox], lagtimes[vox], padtrs, fmritr, theprefilter, optiondict["fmrifreq"], refineprenorm=optiondict["refineprenorm"], lagmaxthresh=optiondict["lagmaxthresh"], refineweighting=optiondict["refineweighting"], detrendorder=optiondict["detrendorder"], offsettime=optiondict["offsettime"], filterbeforePCA=optiondict["filterbeforePCA"], psdfilter=optiondict["psdfilter"], rt_floatset=rt_floatset, rt_floattype=rt_floattype, ) shiftedtcs[retvals[0], :] = retvals[1] weights[retvals[0], :] = retvals[2] if optiondict["psdfilter"]: psdlist.append(retvals[3]) print() if optiondict["psdfilter"]: print(len(psdlist)) print(psdlist[0]) print(np.shape(np.asarray(psdlist, dtype=rt_floattype))) averagepsd = np.mean(np.asarray(psdlist, dtype=rt_floattype), axis=0) stdpsd = np.std(np.asarray(psdlist, dtype=rt_floattype), axis=0) snr = np.nan_to_num(averagepsd / stdpsd) # now generate the refined timecourse(s) validlist = np.where(refinemask > 0)[0] refinevoxels = shiftedtcs[validlist, :] if bipolar: for thevoxel in range(len(validlist)): if lagstrengths[validlist][thevoxel] < 0.0: refinevoxels[thevoxel, :] *= -1.0 refineweights = weights[validlist] weightsum = np.sum(refineweights, axis=0) / volumetotal averagedata = np.sum(refinevoxels, axis=0) 
/ volumetotal if optiondict["cleanrefined"]: invalidlist = np.where((1 - ampmask) > 0)[0] discardvoxels = shiftedtcs[invalidlist] discardweights = weights[invalidlist] discardweightsum = np.sum(discardweights, axis=0) / volumetotal averagediscard = np.sum(discardvoxels, axis=0) / volumetotal if optiondict["dodispersioncalc"]: print("splitting regressors by time lag for phase delay estimation") laglist = np.arange( optiondict["dispersioncalc_lower"], optiondict["dispersioncalc_upper"], optiondict["dispersioncalc_step"], ) dispersioncalcout = np.zeros((np.shape(laglist)[0], inputshape[1]), dtype=rt_floattype) fftlen = int(inputshape[1] // 2) fftlen -= fftlen % 2 dispersioncalcspecmag = np.zeros((np.shape(laglist)[0], fftlen), dtype=rt_floattype) dispersioncalcspecphase = np.zeros((np.shape(laglist)[0], fftlen), dtype=rt_floattype) for lagnum in range(0, np.shape(laglist)[0]): lower = laglist[lagnum] - optiondict["dispersioncalc_step"] / 2.0 upper = laglist[lagnum] + optiondict["dispersioncalc_step"] / 2.0 inlagrange = np.where( locationmask * ampmask * np.where(lower < lagtimes, np.int16(1), np.int16(0)) * np.where(lagtimes < upper, np.int16(1), np.int16(0)))[0] print( " summing", np.shape(inlagrange)[0], "regressors with lags from", lower, "to", upper, ) if np.shape(inlagrange)[0] > 0: dispersioncalcout[lagnum, :] = tide_math.corrnormalize( np.mean(shiftedtcs[inlagrange], axis=0), detrendorder=optiondict["detrendorder"], windowfunc=optiondict["windowfunc"], ) ( freqs, dispersioncalcspecmag[lagnum, :], dispersioncalcspecphase[lagnum, :], ) = tide_math.polarfft(dispersioncalcout[lagnum, :], 1.0 / fmritr) inlagrange = None tide_io.writenpvecs( dispersioncalcout, optiondict["outputname"] + "_dispersioncalcvecs_pass" + str(passnum) + ".txt", ) tide_io.writenpvecs( dispersioncalcspecmag, optiondict["outputname"] + "_dispersioncalcspecmag_pass" + str(passnum) + ".txt", ) tide_io.writenpvecs( dispersioncalcspecphase, optiondict["outputname"] + "_dispersioncalcspecphase_pass" + str(passnum) + ".txt", ) tide_io.writenpvecs( freqs, optiondict["outputname"] + "_dispersioncalcfreqs_pass" + str(passnum) + ".txt", ) if optiondict["pcacomponents"] < 0.0: pcacomponents = "mle" elif optiondict["pcacomponents"] >= 1.0: pcacomponents = int(np.round(optiondict["pcacomponents"])) elif optiondict["pcacomponents"] == 0.0: print("0.0 is not an allowed value for pcacomponents") sys.exit() else: pcacomponents = optiondict["pcacomponents"] icacomponents = 1 if optiondict["refinetype"] == "ica": print("performing ica refinement") thefit = FastICA(n_components=icacomponents).fit( refinevoxels) # Reconstruct signals print("Using first of ", len(thefit.components_), " components") icadata = thefit.components_[0] filteredavg = tide_math.corrnormalize( theprefilter.apply(optiondict["fmrifreq"], averagedata), detrendorder=optiondict["detrendorder"], ) filteredica = tide_math.corrnormalize( theprefilter.apply(optiondict["fmrifreq"], icadata), detrendorder=optiondict["detrendorder"], ) thepxcorr = pearsonr(filteredavg, filteredica)[0] print("ica/avg correlation = ", thepxcorr) if thepxcorr > 0.0: outputdata = 1.0 * icadata else: outputdata = -1.0 * icadata elif optiondict["refinetype"] == "pca": # use the method of "A novel perspective to calibrate temporal delays in cerebrovascular reactivity # using hypercapnic and hyperoxic respiratory challenges". NeuroImage 187, 154?165 (2019). 
print("performing pca refinement with pcacomponents set to", pcacomponents) try: thefit = PCA(n_components=pcacomponents).fit(refinevoxels) except ValueError: if pcacomponents == "mle": print( "mle estimation failed - falling back to pcacomponents=0.8" ) thefit = PCA(n_components=0.8).fit(refinevoxels) else: print("unhandled math exception in PCA refinement - exiting") sys.exit() print( "Using ", len(thefit.components_), " component(s), accounting for ", "{:.2f}% of the variance".format(100.0 * np.cumsum( thefit.explained_variance_ratio_)[len(thefit.components_) - 1]), ) reduceddata = thefit.inverse_transform(thefit.transform(refinevoxels)) if debug: print("complex processing: reduceddata.shape =", reduceddata.shape) pcadata = np.mean(reduceddata, axis=0) filteredavg = tide_math.corrnormalize( theprefilter.apply(optiondict["fmrifreq"], averagedata), detrendorder=optiondict["detrendorder"], ) filteredpca = tide_math.corrnormalize( theprefilter.apply(optiondict["fmrifreq"], pcadata), detrendorder=optiondict["detrendorder"], ) thepxcorr = pearsonr(filteredavg, filteredpca)[0] print("pca/avg correlation = ", thepxcorr) if thepxcorr > 0.0: outputdata = 1.0 * pcadata else: outputdata = -1.0 * pcadata elif optiondict["refinetype"] == "weighted_average": print("performing weighted averaging refinement") outputdata = np.nan_to_num(averagedata / weightsum) else: print("performing unweighted averaging refinement") outputdata = averagedata if optiondict["cleanrefined"]: thefit, R = tide_fit.mlregress(averagediscard, averagedata) fitcoff = rt_floatset(thefit[0, 1]) datatoremove = rt_floatset(fitcoff * averagediscard) outputdata -= datatoremove print() print( "Timeshift applied to " + str(int(volumetotal)) + " voxels, " + str(len(validlist)) + " used for refinement:", "\n ", locationfails, " locationfails", "\n ", ampfails, " ampfails", "\n ", lagfails, " lagfails", "\n ", sigmafails, " sigmafails", ) if optiondict["psdfilter"]: outputdata = tide_filt.transferfuncfilt(outputdata, snr) # garbage collect collected = gc.collect() print("Garbage collector: collected %d objects." % collected) return volumetotal, outputdata, refinemask, locationfails, ampfails, lagfails, sigmafails
def perform_feature_engineering(train, test, config): for c in train.columns: if (len(train[c].value_counts()) == 2): if (train[c].mean() < config['SparseThreshold']): del train[c] del test[c] col = list(test.columns) if config['ID'] != True: col.remove('ID') # tSVD if (config['tSVD'] == True): tsvd = TruncatedSVD(n_components=config['n_comp']) tsvd_results_train = tsvd.fit_transform(train[col]) tsvd_results_test = tsvd.transform(test[col]) for i in range(1, config['n_comp'] + 1): train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1] test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1] # PCA if (config['PCA'] == True): pca = PCA(n_components=config['n_comp']) pca2_results_train = pca.fit_transform(train[col]) pca2_results_test = pca.transform(test[col]) for i in range(1, config['n_comp'] + 1): train['pca_' + str(i)] = pca2_results_train[:, i - 1] test['pca_' + str(i)] = pca2_results_test[:, i - 1] # ICA if (config['ICA'] == True): ica = FastICA(n_components=config['n_comp']) ica2_results_train = ica.fit_transform(train[col]) ica2_results_test = ica.transform(test[col]) for i in range(1, config['n_comp'] + 1): train['ica_' + str(i)] = ica2_results_train[:, i - 1] test['ica_' + str(i)] = ica2_results_test[:, i - 1] # GRP if (config['GRP'] == True): grp = GaussianRandomProjection(n_components=config['n_comp'], eps=0.1) grp_results_train = grp.fit_transform(train[col]) grp_results_test = grp.transform(test[col]) for i in range(1, config['n_comp'] + 1): train['grp_' + str(i)] = grp_results_train[:, i - 1] test['grp_' + str(i)] = grp_results_test[:, i - 1] # SRP if (config['SRP'] == True): srp = SparseRandomProjection(n_components=config['n_comp'], dense_output=True, random_state=420) srp_results_train = srp.fit_transform(train[col]) srp_results_test = srp.transform(test[col]) for i in range(1, config['n_comp'] + 1): train['srp_' + str(i)] = srp_results_train[:, i - 1] test['srp_' + str(i)] = srp_results_test[:, i - 1] if config['magic'] == True: magic_mat = train[['ID', 'X0', 'y']] magic_mat = magic_mat.groupby(['X0'])['y'].mean() magic_mat = pd.DataFrame({ 'X0': magic_mat.index, 'magic': list(magic_mat) }) mean_magic = magic_mat['magic'].mean() train = train.merge(magic_mat, on='X0', how='left') test = test.merge(magic_mat, on='X0', how='left') test['magic'] = test['magic'].fillna(mean_magic) return train, test
tempDF = pd.DataFrame(data=xDF.loc[:,0:1], index=xDF.index) tempDF = pd.concat((tempDF,yDF), axis=1, join="inner") tempDF.columns = ["First Vector", "Second Vector", "Label"] sns.lmplot(x="First Vector", y="Second Vector", hue="Label", \ data=tempDF, fit_reg=False) ax = plt.gca() ax.set_title("Separation of Observations using "+algoName) #---------------------------------------------------------------------------------------------------- # Independent Component Analysis from sklearn.decomposition import FastICA n_components = 25 algorithm = 'parallel' whiten = True max_iter = 100 random_state = 2018 fastICA = FastICA(n_components=n_components, algorithm=algorithm, \ whiten=whiten, max_iter=max_iter, random_state=random_state) X_train_fastICA = fastICA.fit_transform(X_train) X_train_fastICA = pd.DataFrame(data=X_train_fastICA, index=train_index) X_validation_fastICA = fastICA.transform(X_validation) X_validation_fastICA = pd.DataFrame(data=X_validation_fastICA, \ index=validation_index) scatterPlot(X_train_fastICA, y_train, "Independent Component Analysis") plt.show()
n_observations=N + Ntest, n_components_in_mixture=n_components_in_mixture, n_sources=n_sources, n_features=n_features, **cifa_param) for data_generating_model in data_generating_models: for deviation in deviations: for dataset in range(n_datasets): data, reference = sess.run( [data_tf[data_generating_model], reference_tf], feed_dict={placeholder_deviation: deviation}) if initial_direction.lower() == 'ica': init_directions = fica.fit(data).mixing_.T kmeans_cluster_centers = kmeans.fit( fica.transform(data[:N])).cluster_centers_ elif initial_direction.lower() == 'pca': init_directions = pca.fit(data).components_.T kmeans_cluster_centers = kmeans.fit(pca.transform( data[:N])).cluster_centers_ else: init_directions = np.random.randn( n_sources, n_features).astype('float64') kmeans_cluster_centers = kmeans.fit(data[:N].dot( init_directions.T)).cluster_centers_ init_directions = init_directions / np.linalg.norm( init_directions, axis=1, keepdims=True) current_data_variance = data[:N].var()
def test_fastica_simple(add_noise, seed): # Test the FastICA algorithm on very simple data. rng = np.random.RandomState(seed) # scipy.stats uses the global RNG: n_samples = 1000 # Generate two sources: s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1 s2 = stats.t.rvs(1, size=n_samples) s = np.c_[s1, s2].T center_and_norm(s) s1, s2 = s # Mixing angle phi = 0.6 mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi), -np.cos(phi)]]) m = np.dot(mixing, s) if add_noise: m += 0.1 * rng.randn(2, 1000) center_and_norm(m) # function as fun arg def g_test(x): return x**3, (3 * x**2).mean(axis=-1) algos = ['parallel', 'deflation'] nls = ['logcosh', 'exp', 'cube', g_test] whitening = [True, False] for algo, nl, whiten in itertools.product(algos, nls, whitening): if whiten: k_, mixing_, s_ = fastica(m.T, fun=nl, algorithm=algo, random_state=rng) assert_raises(ValueError, fastica, m.T, fun=np.tanh, algorithm=algo) else: pca = PCA(n_components=2, whiten=True, random_state=rng) X = pca.fit_transform(m.T) k_, mixing_, s_ = fastica(X, fun=nl, algorithm=algo, whiten=False, random_state=rng) assert_raises(ValueError, fastica, X, fun=np.tanh, algorithm=algo) s_ = s_.T # Check that the mixing model described in the docstring holds: if whiten: assert_almost_equal(s_, np.dot(np.dot(mixing_, k_), m)) center_and_norm(s_) s1_, s2_ = s_ # Check to see if the sources have been estimated # in the wrong order if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)): s2_, s1_ = s_ s1_ *= np.sign(np.dot(s1_, s1)) s2_ *= np.sign(np.dot(s2_, s2)) # Check that we have estimated the original sources if not add_noise: assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=2) assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=2) else: assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=1) assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=1) # Test FastICA class _, _, sources_fun = fastica(m.T, fun=nl, algorithm=algo, random_state=seed) ica = FastICA(fun=nl, algorithm=algo, random_state=seed) sources = ica.fit_transform(m.T) assert_equal(ica.components_.shape, (2, 2)) assert_equal(sources.shape, (1000, 2)) assert_array_almost_equal(sources_fun, sources) assert_array_almost_equal(sources, ica.transform(m.T)) assert_equal(ica.mixing_.shape, (2, 2)) for fn in [np.tanh, "exp(-.5(x^2))"]: ica = FastICA(fun=fn, algorithm=algo) assert_raises(ValueError, ica.fit, m.T) assert_raises(TypeError, FastICA(fun=range(10)).fit, m.T)
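# A compact, standalone illustration of what the test above verifies: FastICA
# recovering two independent sources from a linear mixture, and the mixing
# model X ~= S @ mixing_.T + mean_ holding after the fit. The sources and the
# mixing matrix below are arbitrary; this is a sketch, not part of the suite.
import numpy as np
from sklearn.decomposition import FastICA

rng = np.random.RandomState(0)
t = np.linspace(0, 8, 2000)
s1 = np.sign(np.sin(3 * t))             # square-ish deterministic source
s2 = rng.laplace(size=t.shape)          # heavy-tailed stochastic source
S = np.c_[s1, s2]
A = np.array([[1.0, 0.5], [0.4, 1.0]])  # mixing matrix
X = S.dot(A.T)                          # observed mixtures, shape (n_samples, 2)

ica = FastICA(n_components=2, random_state=0)
S_est = ica.fit_transform(X)            # sources, up to permutation/sign/scale
X_back = S_est.dot(ica.mixing_.T) + ica.mean_
print(np.allclose(X, X_back))           # the documented mixing model holds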
    line2, = plt.plot(k_arr, kurt_var, color='b', marker='o',
                      label='variance of kurtosis')
    plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
    plt.ylabel('kurtosis')
    plt.xlabel('Number of components')
    plt.show()
    return None

kurt(X, y, 20)

ica = FastICA(n_components=11, random_state=0)
ica_2d = ica.fit_transform(X)
X_ica = ica.transform(X)
plt.scatter(ica_2d[:, 0], ica_2d[:, 1], c=y, cmap="RdGy",
            edgecolor="None", alpha=1, vmin=75, vmax=150)
plt.colorbar()
plt.title('ICA Scatter Plot')

def plot_samples(S, axis_list=None):
    plt.scatter(S[:, 0], S[:, 1],
def DecomposedFeatures(train, test, total, addtrain, addtest, n_components, use_pca=0.0, use_tsvd=0.0, use_ica=0.0, use_fa=0.0, use_grp=0.0, use_srp=0.0): N_COMP = int(n_components * train.shape[1]) + 1 print("\nStart decomposition process...") train_decomposed = np.concatenate([addtrain], axis=1) test_decomposed = np.concatenate([addtest], axis=1) if use_pca > 0.0: print("PCA") N_COMP = int(use_pca * train.shape[1]) + 1 pca = PCA(n_components=N_COMP, whiten=True, svd_solver="full", random_state=42) pca_results = pca.fit(total) pca_results_train = pca.transform(train) pca_results_test = pca.transform(test) train_decomposed = np.concatenate( [pca_results_train, train_decomposed], axis=1) test_decomposed = np.concatenate([pca_results_test, test_decomposed], axis=1) if use_tsvd > 0.0: print("tSVD") N_COMP = int(use_tsvd * train.shape[1]) + 1 tsvd = TruncatedSVD(n_components=N_COMP, random_state=42) tsvd_results = tsvd.fit(total) tsvd_results_train = tsvd.transform(train) tsvd_results_test = tsvd.transform(test) train_decomposed = np.concatenate( [tsvd_results_train, train_decomposed], axis=1) test_decomposed = np.concatenate([tsvd_results_test, test_decomposed], axis=1) if use_ica > 0.0: print("ICA") N_COMP = int(use_ica * train.shape[1]) + 1 ica = FastICA(n_components=N_COMP, random_state=42) ica_results = ica.fit(total) ica_results_train = ica.transform(train) ica_results_test = ica.transform(test) train_decomposed = np.concatenate( [ica_results_train, train_decomposed], axis=1) test_decomposed = np.concatenate([ica_results_test, test_decomposed], axis=1) if use_fa > 0.0: print("FA") N_COMP = int(use_fa * train.shape[1]) + 1 fa = FactorAnalysis(n_components=N_COMP, random_state=42) fa_results = fa.fit(total) fa_results_train = fa.transform(train) fa_results_test = fa.transform(test) train_decomposed = np.concatenate([fa_results_train, train_decomposed], axis=1) test_decomposed = np.concatenate([fa_results_test, test_decomposed], axis=1) if use_grp > 0.0: print("GRP") N_COMP = int(use_grp * train.shape[1]) + 1 grp = GaussianRandomProjection(n_components=N_COMP, eps=0.1, random_state=42) grp_results = grp.fit(total) grp_results_train = grp.transform(train) grp_results_test = grp.transform(test) train_decomposed = np.concatenate( [grp_results_train, train_decomposed], axis=1) test_decomposed = np.concatenate([grp_results_test, test_decomposed], axis=1) if use_srp > 0.0: print("SRP") N_COMP = int(use_srp * train.shape[1]) + 1 srp = SparseRandomProjection(n_components=N_COMP, dense_output=True, random_state=42) srp_results = srp.fit(total) srp_results_train = srp.transform(train) srp_results_test = srp.transform(test) train_decomposed = np.concatenate( [srp_results_train, train_decomposed], axis=1) test_decomposed = np.concatenate([srp_results_test, test_decomposed], axis=1) print("Append decomposition components together...") train_with_only_decomposed_features = pd.DataFrame(train_decomposed) test_with_only_decomposed_features = pd.DataFrame(test_decomposed) #for agg_col in ['sum', 'var', 'mean', 'median', 'std', 'weight_count', 'count_non_0', 'num_different', 'max', 'min']: # train_with_only_decomposed_features[col] = train[col] # test_with_only_decomposed_features[col] = test[col] # Remove any NA train_with_only_decomposed_features = train_with_only_decomposed_features.fillna( 0) test_with_only_decomposed_features = test_with_only_decomposed_features.fillna( 0) return train_with_only_decomposed_features, test_with_only_decomposed_features
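# Hypothetical usage of DecomposedFeatures on synthetic data. 'total' is taken
# to be the row-wise concatenation of train and test (each decomposition is
# fitted on it), and addtrain/addtest are extra feature blocks passed through
# untouched; the fractions set the per-method component counts relative to the
# original column count. Data and fractions below are illustrative only.
import numpy as np
import pandas as pd

rng_demo = np.random.RandomState(0)
train_demo = pd.DataFrame(rng_demo.randn(100, 20))
test_demo = pd.DataFrame(rng_demo.randn(50, 20))
total_demo = pd.concat([train_demo, test_demo], axis=0)
train_dec, test_dec = DecomposedFeatures(
    train_demo, test_demo, total_demo,
    addtrain=train_demo.values, addtest=test_demo.values,
    n_components=0.2, use_pca=0.15, use_ica=0.1)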
class VAE_trainer(): def __init__(self, dim_z=20, device="cuda"): # prepare cuda device self.device = torch.device( device if torch.cuda.is_available() else "cpu") #self.device = torch.device("cpu") # prepare dataset self.dataset = SoundDataset(transform=transforms.ToTensor(), mode='score') # define model self.model = VAE(self.dataset.data_size, dim_z).to(self.device) # define optimizer self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3) self.dim_z = dim_z def load(self, key): self.dataset.load_npz('../data/sounds/raw/' + key + '.npz') self.dataset.normalize() def train(self, epoch, max_epoch): # train mode self.model.train() train_loss = 0 train_loss_vae = 0 train_loss_classifier = 0 train_acc = 0 for batch_idx, (x, y) in enumerate(self.train_loader): x, y = x.to(self.device), y.to(self.device) # zero the parameter gradients self.optimizer.zero_grad() # forward rec_x, pre_y, mu, logvar = self.model(x) loss_vae = self.model.loss_function_vae(rec_x, x, mu, logvar) loss_classifier = self.model.loss_function_classifier(pre_y, y) loss = loss_vae + loss_classifier # backward loss.backward() # update the parameter self.optimizer.step() # logging train_loss += loss.item() train_loss_vae += loss_vae.item() train_loss_classifier += loss_classifier.item() train_acc += self.model.acc(pre_y, y) if batch_idx % 20 == 0: print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( epoch, batch_idx * len(x), len(self.train_loader.dataset), 100. * batch_idx / len(self.train_loader), loss.item() / len(x))) train_loss /= len(self.train_loader.dataset) train_loss_vae /= len(self.train_loader.dataset) train_loss_classifier /= len(self.train_loader.dataset) train_acc /= len(self.train_loader.dataset) print('====> Epoch: {} Average loss: {:.4f}'.format(epoch, train_loss)) return train_loss, train_loss_vae, train_loss_classifier, train_acc def valid(self, epoch): # test mode self.model.eval() valid_loss = 0 valid_loss_vae = 0 valid_loss_classifier = 0 valid_acc = 0 # test mode with torch.no_grad(): for i, (x, y) in enumerate(self.valid_loader): x, y = x.to(self.device), y.to(self.device) rec_x, pre_y, mu, logvar = self.model.valid(x) loss_vae = self.model.loss_function_vae(rec_x, x, mu, logvar) loss_classifier = self.model.loss_function_classifier(pre_y, y) loss = loss_vae + loss_classifier valid_loss += loss.item() valid_loss_vae += loss_vae.item() valid_loss_classifier += loss_classifier.item() valid_acc += self.model.acc(pre_y, y) valid_loss /= len(self.valid_loader.dataset) valid_loss_vae /= len(self.valid_loader.dataset) valid_loss_classifier /= len(self.valid_loader.dataset) valid_acc /= len(self.valid_loader.dataset) print('====> Validation set loss: {:.4f}'.format(valid_loss)) return valid_loss, valid_loss_vae, valid_loss_classifier, valid_acc def auto_train(self, max_epoch, save_path=None): train_set, valid_set = torch.utils.data.random_split( self.dataset, [ int(len(self.dataset) * 0.8), len(self.dataset) - int(len(self.dataset) * 0.8) ]) self.train_loader = torch.utils.data.DataLoader(train_set, batch_size=10, shuffle=True) self.valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=10, shuffle=True) train_loss = [] train_loss_vae = [] train_loss_classifier = [] train_acc = [] valid_loss = [] valid_loss_vae = [] valid_loss_classifier = [] valid_acc = [] for epoch in range(1, max_epoch): t_loss, t_loss_vae, t_loss_classifier, t_acc = self.train( epoch, max_epoch) v_loss, v_loss_vae, v_loss_classifier, v_acc = self.valid(epoch) train_loss.append(t_loss) 
train_loss_vae.append(t_loss_vae) train_loss_classifier.append(t_loss_classifier) train_acc.append(t_acc) valid_loss.append(v_loss) valid_loss_vae.append(v_loss_vae) valid_loss_classifier.append(v_loss_classifier) valid_acc.append(v_acc) # plot result if save_path is not None: fig, ax = plt.subplots(4, 1, figsize=(8, 16)) ax[0].set_title('Loss') ax[1].set_title('VAE Loss') ax[2].set_title('Classifier Loss') ax[3].set_title('Accuracy') for i in range(3): ax[i].set_xlabel('Epochs') ax[i].set_ylabel('Loss') ax[0].plot(range(1, max_epoch), train_loss, label="train") ax[0].plot(range(1, max_epoch), valid_loss, label="validation") ax[1].plot(range(1, max_epoch), train_loss_vae, label="train") ax[1].plot(range(1, max_epoch), valid_loss_vae, label="validation") ax[2].plot(range(1, max_epoch), train_loss_classifier, label="train") ax[2].plot(range(1, max_epoch), valid_loss_classifier, label="validation") ax[3].set_xlabel('Epochs') ax[3].set_ylabel('Accuracy') ax[3].plot(range(1, max_epoch), train_acc, label="train") ax[3].plot(range(1, max_epoch), valid_acc, label="validation") for i in range(3): ax[i].legend() plt.tight_layout() plt.savefig(save_path + '/loss.png') plt.close() def save_weight(self, save_path='../result/VAE-score/model/vae'): torch.save(self.model.state_dict(), save_path) def load_weight(self, load_path='../result/VAE-score/model/vae'): self.model.load_state_dict(torch.load(load_path)) def plot_z(self, save_path='../result/VAE-score/model/result.png'): # print z all data loader = torch.utils.data.DataLoader(self.dataset, batch_size=len(self.dataset), shuffle=False) all_z = [] all_ans = [] self.model.eval() with torch.no_grad(): for i, (data, ans) in enumerate(loader): data = data.to(self.device) _, _, mu, logvar = self.model.forward(data) all_z = np.append(all_z, mu.to('cpu').clone().numpy()) all_z = np.array(all_z).reshape(-1, self.model.z_shape) all_ans = self.dataset.ans # LDA #self.lda = LDA(n_components = 2) #self.lda.fit(all_z, all_ans) #lda_z = self.lda.transform(all_z) #lda_z = lda_z.transpose() #z_xrange = [np.min(lda_z[0]), np.max(lda_z[0])] #z_yrange = [np.min(lda_z[1]), np.max(lda_z[1])] #plot_z(lda_z[0], lda_z[1], all_ans, "z map", save_path.split('.png')[0] + '_LDA.png', z_xrange, z_yrange) #plot_z_each(lda_z, all_ans, self.dataset.filenames, '../data/succeed_list_sound.csv', "z map", # save_path.split('.png')[0] + '_LDA_each.png', z_xrange, z_yrange) # ICA self.ica = FastICA(n_components=2) self.ica.fit(all_z) ica_z = self.ica.transform(all_z) ica_z = ica_z.transpose() z_xrange = [np.min(ica_z[0]), np.max(ica_z[0])] z_yrange = [np.min(ica_z[1]), np.max(ica_z[1])] plot_z(ica_z[0], ica_z[1], all_ans, "z map", save_path.split('.png')[0] + '_ICA.png', z_xrange, z_yrange) plot_z_each(ica_z, all_ans, self.dataset.filenames, '../data/succeed_list_sound.csv', "z map", save_path.split('.png')[0] + '_ICA_each.png', z_xrange, z_yrange) return all_z, all_ans, ica_z.transpose() def reconstruct(self, save_path='../result/VAE-score/reconstructed_sounds'): loader = torch.utils.data.DataLoader(self.dataset, batch_size=1, shuffle=False) self.model.eval() with torch.no_grad(): for i, (x, y) in enumerate(loader): x = x.to(self.device) recon_x, _, _, _ = self.model.forward(x) recon_x = recon_x.to('cpu').clone().numpy() x = x.to('cpu').clone().numpy() x = x.reshape(3, -1) recon_x = recon_x.reshape(3, -1) # to png fig, ax = plt.subplots(2, 3, figsize=(24, 12)) ax[0][0].set_title('L') ax[0][1].set_title('C') ax[0][2].set_title('R') ax[1][0].set_title('reconstructed L') 
ax[1][1].set_title('reconstructed C') ax[1][2].set_title('reconstructed R') time = range(len(x[0])) for j in range(3): ax[0][j].set_ylim(0, 1) ax[1][j].set_ylim(0, 1) ax[0][j].plot(time, x[j], linewidth=1) ax[1][j].plot(time, recon_x[j], linewidth=1) plt.tight_layout() plt.savefig(save_path + '/' + self.dataset.filenames[i].split('.csv')[0] + '.png') plt.close() # to csv save_data = pd.DataFrame(data=recon_x) save_data.to_csv(save_path + '/' + self.dataset.filenames[i], index=False)
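# The plot_z method above uses FastICA purely as a 2-D visualisation of the
# VAE latent means. A standalone sketch of just that projection step, on a
# hypothetical latent matrix all_z of shape (n_samples, dim_z):
import numpy as np
from sklearn.decomposition import FastICA


def project_latents_2d(all_z, random_state=0):
    ica = FastICA(n_components=2, random_state=random_state)
    z2d = ica.fit_transform(all_z)                   # (n_samples, 2)
    z_xrange = [z2d[:, 0].min(), z2d[:, 0].max()]
    z_yrange = [z2d[:, 1].min(), z2d[:, 1].max()]
    return z2d, z_xrange, z_yrange


# z2d, xr, yr = project_latents_2d(np.random.randn(200, 20))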
class ICA(object): """M/EEG signal decomposition using Independent Component Analysis (ICA) This object can be used to estimate ICA components and then remove some from Raw or Epochs for data exploration or artifact correction. Parameters ---------- n_components : int | float | None The number of components used for ICA decomposition. If int, it must be smaller then max_n_components. If None, all PCA components will be used. If float between 0 and 1 components can will be selected by the cumulative percentage of explained variance. max_n_components : int | None The number of components used for PCA decomposition. If None, no dimension reduction will be applied and max_n_components will equal the number of channels supplied on decomposing data. noise_cov : None | instance of mne.cov.Covariance Noise covariance used for whitening. If None, channels are just z-scored. random_state : None | int | instance of np.random.RandomState np.random.RandomState to initialize the FastICA estimation. As the estimation is non-deterministic it can be useful to fix the seed to have reproducible results. algorithm : {'parallel', 'deflation'} Apply parallel or deflational algorithm for FastICA fun : string or function, optional. Default: 'logcosh' The functional form of the G function used in the approximation to neg-entropy. Could be either 'logcosh', 'exp', or 'cube'. You can also provide your own function. It should return a tuple containing the value of the function, and of its derivative, in the point. fun_args: dictionary, optional Arguments to send to the functional form. If empty and if fun='logcosh', fun_args will take value {'alpha' : 1.0} verbose : bool, str, int, or None If not None, override default verbose level (see mne.verbose). Attributes ---------- last_fit : str Flag informing about which type was last fit. ch_names : list-like Channel names resulting from initial picking. n_components : int The number of components used for ICA decomposition. max_n_components : int The number of PCA dimensions computed. verbose : bool, str, int, or None See above. """ @verbose def __init__(self, n_components, max_n_components=100, noise_cov=None, random_state=None, algorithm='parallel', fun='logcosh', fun_args=None, verbose=None): try: from sklearn.decomposition import FastICA # to avoid strong dep. 
except ImportError: raise Exception('the scikit-learn package is missing and ' 'required for ICA') self.noise_cov = noise_cov # sklearn < 0.11 does not support random_state argument for FastICA kwargs = {'algorithm': algorithm, 'fun': fun, 'fun_args': fun_args} if random_state is not None: aspec = inspect.getargspec(FastICA.__init__) if 'random_state' not in aspec.args: warnings.warn('random_state argument ignored, update ' 'scikit-learn to version 0.11 or newer') else: kwargs['random_state'] = random_state if max_n_components is not None and n_components > max_n_components: raise ValueError('n_components must be smaller than ' 'max_n_components') if isinstance(n_components, float): if not 0 < n_components <= 1: raise ValueError('For selecting ICA components by the ' 'explained variance of PCA components the' ' float value must be between 0.0 and 1.0 ') self._explained_var = n_components logger.info('Selecting pca_components via explained variance.') else: self._explained_var = 1.1 logger.info('Selecting pca_components directly.') self._ica = FastICA(**kwargs) self.current_fit = 'unfitted' self.verbose = verbose self.n_components = n_components self.max_n_components = max_n_components self.ch_names = None self._mixing = None def __repr__(self): s = 'ICA ' if self.current_fit == 'unfitted': msg = '(no' elif self.current_fit == 'raw': msg = '(raw data' else: msg = '(epochs' msg += ' decomposition, ' s += msg + ('%s components' % str(self.n_components) if self.n_components else 'no dimension reduction') + ')' return s @verbose def decompose_raw(self, raw, picks=None, start=None, stop=None, verbose=None): """Run the ICA decomposition on raw data Parameters ---------- raw : instance of mne.fiff.Raw Raw measurements to be decomposed. picks : array-like Channels to be included. This selection remains throughout the initialized ICA session. If None only good data channels are used. start : int First sample to include (first is 0). If omitted, defaults to the first sample in data. stop : int First sample to not include. If omitted, data is included to the end. verbose : bool, str, int, or None If not None, override default verbose level (see mne.verbose). Defaults to self.verbose. Returns ------- self : instance of ICA Returns the modified instance. """ if self.current_fit != 'unfitted': raise RuntimeError('ICA decomposition has already been fitted. ' 'Please start a new ICA session.') logger.info('Computing signal decomposition on raw data. ' 'Please be patient, this may take some time') if picks is None: # just use good data channels picks = pick_types(raw.info, meg=True, eeg=True, eog=False, ecg=False, misc=False, stim=False, exclude=raw.info['bads']) if self.max_n_components is None: self.max_n_components = len(picks) logger.info('Inferring max_n_components from picks.') self.ch_names = [raw.ch_names[k] for k in picks] data, self._pre_whitener = self._pre_whiten(raw[picks, start:stop][0], raw.info, picks) to_ica, self._pca = self._prepare_pca(data, self.max_n_components) self._ica.fit(to_ica) self._mixing = self._ica.get_mixing_matrix().T self.current_fit = 'raw' return self @verbose def decompose_epochs(self, epochs, picks=None, verbose=None): """Run the ICA decomposition on epochs Parameters ---------- epochs : instance of Epochs The epochs. The ICA is estimated on the concatenated epochs. picks : array-like Channels to be included relative to the channels already picked on epochs-initialization. This selection remains throughout the initialized ICA session. 
verbose : bool, str, int, or None If not None, override default verbose level (see mne.verbose). Defaults to self.verbose. Returns ------- self : instance of ICA Returns the modified instance. """ if self.current_fit != 'unfitted': raise RuntimeError('ICA decomposition has already been fitted. ' 'Please start a new ICA session.') logger.info('Computing signal decomposition on epochs. ' 'Please be patient, this may take some time') if picks is None: # just use epochs good data channels and avoid picks = pick_types(epochs.info, include=epochs.ch_names, # double exclude=epochs.info['bads']) # picking meeg_picks = pick_types(epochs.info, meg=True, eeg=True, eog=False, ecg=False, misc=False, stim=False, exclude=epochs.info['bads']) # filter out all the channels the raw wouldn't have initialized picks = np.intersect1d(meeg_picks, picks) self.ch_names = [epochs.ch_names[k] for k in picks] if self.max_n_components is None: self.max_n_components = len(picks) logger.info('Inferring max_n_components from picks.') data, self._pre_whitener = self._pre_whiten( np.hstack(epochs.get_data()[:, picks]), epochs.info, picks) to_ica, self._pca = self._prepare_pca(data, self.max_n_components) self._ica.fit(to_ica) self._mixing = self._ica.get_mixing_matrix().T self.current_fit = 'epochs' return self def get_sources_raw(self, raw, start=None, stop=None): """Estimate raw sources given the unmixing matrix Parameters ---------- raw : instance of Raw Raw object to draw sources from. start : int First sample to include (first is 0). If omitted, defaults to the first sample in data. stop : int First sample to not include. If omitted, data is included to the end. Returns ------- sources : array, shape = (n_components, n_times) The ICA sources time series. """ if self._mixing is None: raise RuntimeError('No fit available. Please first fit ICA ' 'decomposition.') return self._get_sources_raw(raw, start, stop)[0] def _get_sources_raw(self, raw, start, stop): picks = [raw.ch_names.index(k) for k in self.ch_names] data, _ = self._pre_whiten(raw[picks, start:stop][0], raw.info, picks) pca_data = self._pca.transform(data.T) raw_sources = self._ica.transform(pca_data[:, self._comp_idx]).T return raw_sources, pca_data def get_sources_epochs(self, epochs, concatenate=False): """Estimate epochs sources given the unmixing matrix Parameters ---------- epochs : instance of Epochs Epochs object to draw sources from. concatenate : bool If true, epochs and time slices will be concatenated. Returns ------- epochs_sources : ndarray of shape (n_epochs, n_sources, n_times) The sources for each epoch """ if self._mixing is None: raise RuntimeError('No fit available. Please first fit ICA ' 'decomposition.') return self._get_sources_epochs(epochs, concatenate)[0] def _get_sources_epochs(self, epochs, concatenate): picks = pick_types(epochs.info, include=self.ch_names, exclude=epochs.info['bads']) # special case where epochs come picked but fit was 'unpicked'. if len(picks) != len(self.ch_names): raise RuntimeError('Epochs don\'t match fitted data: %i channels ' 'fitted but %i channels supplied. 
\nPlease ' 'provide Epochs compatible with ' 'ica.ch_names' % (len(self.ch_names), len(picks))) data, _ = self._pre_whiten(np.hstack(epochs.get_data()[:, picks]), epochs.info, picks) pca_data = self._pca.transform(data.T) sources = self._ica.transform(pca_data[:, self._comp_idx]).T sources = np.array(np.split(sources, len(epochs.events), 1)) if concatenate: sources = np.hstack(sources) return sources, pca_data def export_sources(self, raw, picks=None, start=None, stop=None): """Export sources as raw object Parameters ---------- raw : instance of Raw Raw object to export sources from. picks : array-like Channels to be included in addition to the sources. If None, artifact and stimulus channels will be included. start : int First sample to include (first is 0). If omitted, defaults to the first sample in data. stop : int First sample to not include. If omitted, data is included to the end. Returns ------- out : instance of mne.Raw Container object for ICA sources """ if not raw._preloaded: raise ValueError('raw data should be preloaded to have this ' 'working. Please read raw data with ' 'preload=True.') # include 'reference' channels for comparison with ICA if picks is None: picks = pick_types(raw.info, meg=False, eeg=False, misc=True, ecg=True, eog=True, stim=True) # merge copied instance and picked data with sources out = raw.copy() out.fids = [] sources = self.get_sources_raw(raw, start=start, stop=stop) out._data = np.r_[sources, raw[picks, start:stop][0]] # update first and last samples out.first_samp = raw.first_samp + (start if start else 0) out.last_samp = out.first_samp + stop if stop else raw.last_samp # set channel names and info ch_names = out.info['ch_names'] = [] ch_info = out.info['chs'] = [] for i in xrange(self.n_components): ch_names.append('ICA %03d' % (i + 1)) ch_info.append(dict(ch_name='ICA %03d' % (i + 1), cal=1, logno=i + 1, coil_type=FIFF.FIFFV_COIL_NONE, kind=FIFF.FIFFV_MISC_CH, coord_Frame=FIFF.FIFFV_COORD_UNKNOWN, loc=np.array([0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1.], dtype=np.float32), unit=FIFF.FIFF_UNIT_NONE, eeg_loc=None, range=1.0, scanno=i + 1, unit_mul=0, coil_trans=None)) # re-append additionally picked ch_names ch_names += [raw.ch_names[k] for k in picks] # re-append additionally picked ch_info ch_info += [raw.info['chs'][k] for k in picks] # update number of channels out.info['nchan'] = len(picks) + self.n_components return out def plot_sources_raw(self, raw, order=None, start=None, stop=None, n_components=None, source_idx=None, ncol=3, nrow=10, show=True): """Create panel plots of ICA sources. Wrapper around viz.plot_ica_panel Parameters ---------- raw : instance of mne.fiff.Raw Raw object to plot the sources from. order : ndarray | None. Index of length n_components. If None, plot will show the sources in the order as fitted. Example: arg_sort = np.argsort(np.var(sources)). start : int X-axis start index. If None from the beginning. stop : int X-axis stop index. If None to the end. n_components : int Number of components fitted. source_idx : array-like Indices for subsetting the sources. ncol : int Number of panel-columns. nrow : int Number of panel-rows. show : bool If True, plot will be shown, else just the figure is returned. 
Returns ------- fig : instance of pyplot.Figure """ sources = self.get_sources_raw(raw, start=start, stop=stop) if order is not None: if len(order) != sources.shape[0]: raise ValueError('order and sources have to be of the ' 'same length.') else: sources = sources[order] fig = plot_ica_panel(sources, start=0 if start is not None else start, stop=(stop - start) if stop is not None else stop, n_components=n_components, source_idx=source_idx, ncol=ncol, nrow=nrow) if show: import matplotlib.pylab as pl pl.show() return fig def plot_sources_epochs(self, epochs, epoch_idx=None, order=None, start=None, stop=None, n_components=None, source_idx=None, ncol=3, nrow=10, show=True): """Create panel plots of ICA sources. Wrapper around viz.plot_ica_panel Parameters ---------- epochs : instance of mne.Epochs Epochs object to plot the sources from. epoch_idx : int Index to plot particular epoch. order : ndarray | None. Index of length n_components. If None, plot will show the sources in the order as fitted. Example: arg_sort = np.argsort(np.var(sources)). sources : ndarray Sources as drawn from self.get_sources. start : int X-axis start index. If None from the beginning. stop : int X-axis stop index. If None to the end. n_components : int Number of components fitted. source_idx : array-like Indices for subsetting the sources. ncol : int Number of panel-columns. nrow : int Number of panel-rows. show : bool If True, plot will be shown, else just the figure is returned. Returns ------- fig : instance of pyplot.Figure """ sources = self.get_sources_epochs(epochs, concatenate=True if epoch_idx is None else False) source_dim = 1 if sources.ndim > 2 else 0 if order is not None: if len(order) != sources.shape[source_dim]: raise ValueError('order and sources have to be of the ' 'same length.') else: sources = (sources[:, order] if source_dim else sources[order]) fig = plot_ica_panel(sources[epoch_idx], start=start, stop=stop, n_components=n_components, source_idx=source_idx, ncol=ncol, nrow=nrow) if show: import matplotlib.pylab as pl pl.show() return fig def find_sources_raw(self, raw, target=None, score_func='pearsonr', start=None, stop=None): """Find sources based on own distribution or based on similarity to other sources or between source and target. Parameters ---------- raw : instance of Raw Raw object to draw sources from. target : array-like | ch_name | None Signal to which the sources shall be compared. It has to be of the same shape as the sources. If some string is supplied, a routine will try to find a matching channel. If None, a score function expecting only one input-array argument must be used, for instance, scipy.stats.skew (default). score_func : callable | str label Callable taking as arguments either two input arrays (e.g. pearson correlation) or one input array (e. g. skewness) and returns a float. For convenience the most common score_funcs are available via string labels: Currently, all distance metrics from scipy.spatial and all functions from scipy.stats taking compatible input arguments are supported. These function have been modified to support iteration over the rows of a 2D array. start : int First sample to include (first is 0). If omitted, defaults to the first sample in data. stop : int First sample to not include. If omitted, data is included to the end. scores : ndarray Scores for each source as returned from score_func. 
Returns ------- scores : ndarray scores for each source as returned from score_func """ # auto source drawing sources = self.get_sources_raw(raw=raw, start=start, stop=stop) # auto target selection if target is not None: if hasattr(target, 'ndim'): if target.ndim < 2: target = target.reshape(1, target.shape[-1]) if isinstance(target, str): pick = _get_target_ch(raw, target) target, _ = raw[pick, start:stop] if sources.shape[1] != target.shape[1]: raise ValueError('Source and targets do not have the same' 'number of time slices.') target = target.ravel() return _find_sources(sources, target, score_func) def find_sources_epochs(self, epochs, target=None, score_func='pearsonr'): """Find sources based on relations between source and target Parameters ---------- epochs : instance of Epochs Epochs object to draw sources from. target : array-like | ch_name | None Signal to which the sources shall be compared. It has to be of the same shape as the sources. If some string is supplied, a routine will try to find a matching channel. If None, a score function expecting only one input-array argument must be used, for instance, scipy.stats.skew (default). score_func : callable | str label Callable taking as arguments either two input arrays (e.g. pearson correlation) or one input array (e. g. skewness) and returns a float. For convenience the most common score_funcs are available via string labels: Currently, all distance metrics from scipy.spatial and all functions from scipy.stats taking compatible input arguments are supported. These function have been modified to support iteration over the rows of a 2D array. Returns ------- scores : ndarray scores for each source as returned from score_func """ sources = self.get_sources_epochs(epochs=epochs) # auto target selection if target is not None: if hasattr(target, 'ndim'): if target.ndim < 3: target = target.reshape(1, 1, target.shape[-1]) if isinstance(target, str): pick = _get_target_ch(epochs, target) target = epochs.get_data()[:, pick] if sources.shape[2] != target.shape[2]: raise ValueError('Source and targets do not have the same' 'number of time slices.') target = target.ravel() return _find_sources(np.hstack(sources), target, score_func) def pick_sources_raw(self, raw, include=None, exclude=None, n_pca_components=64, start=None, stop=None, copy=True): """Recompose raw data including or excluding some sources Parameters ---------- raw : instance of Raw Raw object to pick to remove ICA components from. include : list-like | None The source indices to use. If None all are used. exclude : list-like | None The source indices to remove. If None all are used. n_pca_components: The number of PCA components to be unwhitened, where n_components is the lower bound and max_n_components the upper bound. If greater than self.n_components, the PCA components that were not supplied to the ICA will get re-attached. This can be used to take back the PCA dimension reduction. start : int | None The first time index to include. stop : int | None The first time index to exclude. copy: bool modify raw instance in place or return modified copy. Returns ------- raw : instance of Raw raw instance with selected ICA components removed """ if not raw._preloaded: raise ValueError('raw data should be preloaded to have this ' 'working. Please read raw data with ' 'preload=True.') if self.current_fit != 'raw': raise ValueError('Currently no raw data fitted.' 
'Please fit raw data first.') sources, pca_data = self._get_sources_raw(raw, start=start, stop=stop) recomposed = self._pick_sources(sources, pca_data, include, exclude, n_pca_components) if copy is True: raw = raw.copy() picks = [raw.ch_names.index(k) for k in self.ch_names] raw[picks, start:stop] = recomposed return raw def pick_sources_epochs(self, epochs, include=None, exclude=None, n_pca_components=64, copy=True): """Recompose epochs Parameters ---------- epochs : instance of Epochs Epochs object to pick to remove ICA components from. include : list-like | None The source indices to use. If None all are used. exclude : list-like | None The source indices to remove. If None all are used. n_pca_components: The number of PCA components to be unwhitened, where n_components is the lower bound and max_n_components the upper bound. If greater than self.n_components, the PCA components that were not supplied to the ICA will get re-attached. This can be used to take back the PCA dimension reduction. copy : bool Modify Epochs instance in place or return modified copy. Returns ------- epochs : instance of Epochs Epochs with selected ICA components removed. """ if not epochs.preload: raise ValueError('raw data should be preloaded to have this ' 'working. Please read raw data with ' 'preload=True.') sources, pca_data = self._get_sources_epochs(epochs, True) picks = pick_types(epochs.info, include=self.ch_names, exclude=epochs.info['bads']) if copy is True: epochs = epochs.copy() # put sources-dimension first for selection recomposed = self._pick_sources(sources, pca_data, include, exclude, n_pca_components) # restore epochs, channels, tsl order epochs._data[:, picks] = np.array(np.split(recomposed, len(epochs.events), 1)) epochs.preload = True return epochs def _pre_whiten(self, data, info, picks): """Helper function""" if self.noise_cov is None: # use standardization as whitener pre_whitener = np.std(data) ** -1 data *= pre_whitener else: # pick cov ncov = deepcopy(self.noise_cov) if ncov.ch_names != self.ch_names: ncov['data'] = ncov.data[picks][:, picks] assert data.shape[0] == ncov.data.shape[0] pre_whitener, _ = compute_whitener(ncov, info, picks) data = np.dot(pre_whitener, data) return data, pre_whitener def _prepare_pca(self, data, max_n_components): """ Helper Function """ from sklearn.decomposition import RandomizedPCA # sklearn < 0.11 does not support random_state argument kwargs = {'n_components': max_n_components, 'whiten': False} aspec = inspect.getargspec(RandomizedPCA.__init__) if 'random_state' not in aspec.args: warnings.warn('RandomizedPCA does not support random_state ' 'argument. 
Use scikit-learn to version 0.11 ' 'or newer to get reproducible results.') else: kwargs['random_state'] = 0 pca = RandomizedPCA(**kwargs) pca_data = pca.fit_transform(data.T) if self._explained_var > 1.0: if self.n_components is not None: # normal n case self._comp_idx = np.arange(self.n_components) to_ica = pca_data[:, self._comp_idx] else: # None case to_ica = pca_data self.n_components = pca_data.shape[1] self._comp_idx = np.arange(self.n_components) else: # float case expl_var = pca.explained_variance_ratio_ self._comp_idx = (np.where(expl_var.cumsum() < self._explained_var)[0]) to_ica = pca_data[:, self._comp_idx] self.n_components = len(self._comp_idx) return to_ica, pca def _pick_sources(self, sources, pca_data, include, exclude, n_pca_components): """Helper function""" if not(self.n_components <= n_pca_components <= self.max_n_components): raise ValueError('n_pca_components must be between n_components' ' and max_n_components.') if include not in (None, []): mute = [i for i in xrange(len(sources)) if i not in include] sources[mute, :] = 0. # include via exclusion elif exclude not in (None, []): sources[exclude, :] = 0. # just exclude # restore pca data mixing = self._mixing.copy() pca_restored = np.dot(sources.T, mixing) # re-append deselected pca dimension if desired if n_pca_components - self.n_components > 0: add_components = np.arange(self.n_components, n_pca_components) pca_reappend = pca_data[:, add_components] pca_restored = np.c_[pca_restored, pca_reappend] # restore sensor space data out = _inverse_t_pca(pca_restored, self._pca) # restore scaling pre_whitener = self._pre_whitener.copy() if self.noise_cov is None: # revert standardization pre_whitener **= -1 out *= pre_whitener else: out = np.dot(out, linalg.pinv(pre_whitener)) return out.T
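# A hedged usage sketch of the ICA class above, following its own docstrings
# (decompose_raw -> find_sources_raw -> pick_sources_raw). It is kept as
# comments because it needs a preloaded mne.fiff.Raw instance; the component
# selection shown is illustrative, not a recommended artifact-rejection recipe.
# from scipy import stats
# ica = ICA(n_components=0.90, max_n_components=100, random_state=0)
# ica.decompose_raw(raw, start=0, stop=30000)
# scores = ica.find_sources_raw(raw, target=None, score_func=stats.skew)
# bad = [int(np.argmax(np.abs(scores)))]        # e.g. drop the most skewed source
# raw_clean = ica.pick_sources_raw(raw, exclude=bad, n_pca_components=64)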
plt.legend([b[0] for b in bars], cv_types)
plt.show()

X = load_wine().data
y = load_wine().target
X = np.array(X, dtype=np.float32)

scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

ica = ICA(n_components=4)
ica.fit(X)
X = ica.transform(X)

plot_bic(X)

gmm = mixture.GaussianMixture(n_components=2, covariance_type='tied')
gmm.fit(X)
gaussianGroups = [[], []]
for pt in X:
    res = gmm.predict(pt.reshape(1, -1))[0]
    gaussianGroups[res].append(pt)
print(gaussianGroups[1])
ax1.set_ylabel('Mean Cross Validation Accuracy') ax1.axvline(gridSearch.best_estimator_.named_steps['ica'].n_components, linestyle=':', label='n_components chosen', linewidth=2) plt.legend(prop=dict(size=12)) plt.title('Accuracy/kurtosis for ICA (best n_components= %d)' % gridSearch.best_estimator_.named_steps['ica'].n_components) plt.show() #Reducing the dimensions with optimal number of components ica_new = FastICA( n_components=gridSearch.best_estimator_.named_steps['ica'].n_components) ica_new.fit(X_train) X_train_transformed = ica_new.transform(X_train) X_test_transformed = ica_new.transform(X_test) ############################################################################################################################### #Reconstruction Error print("Calculating Reconstruction Error") reconstruction_error = [] for comp in n_components: ica = FastICA(n_components=comp) X_transformed = ica.fit_transform(X_train) X_projected = ica.inverse_transform(X_transformed) reconstruction_error.append(((X_train - X_projected)**2).mean())
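# The reconstruction error above relies on FastICA.inverse_transform, which
# maps sources back through the mixing matrix: X_hat = S @ mixing_.T + mean_.
# A small self-contained check of that equivalence on random, purely
# illustrative data:
import numpy as np
from sklearn.decomposition import FastICA

X_demo = np.random.RandomState(0).randn(300, 10)
ica_demo = FastICA(n_components=4, random_state=0, max_iter=1000)
S_demo = ica_demo.fit_transform(X_demo)
X_back = ica_demo.inverse_transform(S_demo)
manual = S_demo.dot(ica_demo.mixing_.T) + ica_demo.mean_
print(np.allclose(X_back, manual))                      # same reconstruction
print("reconstruction MSE:", ((X_demo - X_back) ** 2).mean())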
def extract_staining (img, mask, expected, white_threshold=.15, verbose=False): """ Extract spatial distribution of the stain and pigment in the image Parameters ---------- img: array [height, width, 3] The RGB image to be decomposed. Range of values 0 - 255 mask: array [height, width] The mask showing the embryo location. expected: [(array [3], float), (array [3], float), (array [3], float)] List of pairs (RGB colour, confidence) for expected stain, pigment and background colour respectively. If any of the colours is not expected to be present corresponding RGB value should be set to white ([255, 255, 255]) white_threshold: float, optional, default: .15 Maximal length of a colour vector in CMY unit cube, which is still considered white. verbose: Bool/str, optional, default: False Verbosity level: - False: silent run - True: report textual information - Path prefix: report textual information and report graphical information in verbose+<suffix>.jpg files Returns ------- strain: array [height, width] Spatial distribution of the stain confidence: float Confidence level that the exp[ected stain is actually present in the image [0..1] pigment: array [height, width] Spatial distribution of the pigment colours: array [3, 3] RGB colours estimated from the image. If a colour is not present it is set to white """ img_ = img.copy() img = 1 - img/255. c = 1 - np.vstack([colour for colour, p in expected])/255. c_save = c.copy() norms = np.array([la.norm(c[0], 2), la.norm(c[1], 2), la.norm(c[2], 2)]) _, d, _ = la.svd([clr/n for clr, n in zip(c, norms) if n > 1.e-2]) if d[-1]/d[0] < .05 and c.shape[0] > 1: dist = [] for i in range(1, c.shape[0]): comb = (0, i) if np.all(norms[list(comb)] > 0.01): dist += [(comb, la.norm(c[comb[0]]/norms[comb[0]] - c[comb[1]]/norms[comb[1]] ,2))] farthest = list(max(dist, key= lambda x: x[1])[0]) norms_ = np.zeros_like(norms) norms_[farthest] = norms[farthest] norms = norms_ c = np.array([clr/n for clr, n in zip(c, norms) if n > 1.e-2]) prior = np.array([p for colour, p in expected])[norms > 1.e-2] X = img[(mask == 1)].reshape(-1, 3) np.random.shuffle(X) X = X[:500000] # non_white_x = np.sum(X**2, axis=-1) > white_threshold**2 if np.all(~non_white_x): c_est = np.ones((len(norms), c.shape[1])) c_est[norms > 1.e-2] -= c return np.zeros(img.shape[:2]), 0, np.zeros(img.shape[:2]), np.uint8(np.maximum(0, np.minimum(1, c_est))*255) #estimating number of independent components infos = [] icamodels = [] n_comp = 3 for j in range(n_comp): w0 = np.ones((1, 1)) if j > 0: rot = np.ones((2, 2))*np.sqrt(0.5) rot[1, 0] *= -1 w0 = np.eye(j + 1) for i in range(j): R = np.eye(j + 1) R[np.kron(np.arange(j + 1) != 2 - i, np.arange(j + 1) != 2 - i).reshape(R.shape)] = rot.ravel() w0 = w0.dot(R) ws = [np.eye(j + 1), w0] icas = [] for k, w in enumerate(ws): ica = FastICA(j + 1, fun='exp', max_iter=500, w_init=w) res = ica.fit_transform(X)# if type(verbose) == str: res_ = ica.transform(img.reshape(-1, 3)) ms = np.mean(res, axis=0) ss = np.std(res, axis=0) kln = 0 for i, d in enumerate(res.T): q, bins = np.histogram(d, bins=50) q = np.asarray(q, dtype=float) + 1. 
q /= np.sum(q) p = st.norm(ms[i], ss[i]).pdf(.5*(bins[:-1] + bins[1:])) p /= np.sum(p) kl = st.entropy(p, q) # kln += kl icas += [(kln, ica)] info, ica = sorted(icas, key=lambda x : -x[0])[0] infos += [info/(j + 1)] icamodels += [ica] n_comp = min(c.shape[0], np.argmax(infos) + 1) rerun = True c_initial = c.copy() while rerun: rerun = False c = c_initial.copy() ica = icamodels[n_comp - 1] res = ica.transform(X) s0 = ica.transform(np.zeros((1, X.shape[1]))) res -= s0 adj_ica = icamodels[1] dirs = adj_ica.mixing_.T.copy() dirs /= np.sqrt(np.sum(dirs**2, axis=-1))[:, np.newaxis] icas = np.cross(dirs[0], dirs[1]) #deciding on which expected components are present models = [] for ind in it.combinations(range(c.shape[0]), n_comp): S = np.array([[np.corrcoef(np.vstack([es, ex]))[0, 1] for es in res.T] for ex in (X.dot(la.pinv(c[list(ind)]))).T]) sc = np.abs(la.det(S)) acs = 0 if n_comp == 2 and c.shape[0] > 2: acs = np.abs(icas.dot(np.cross(c[ind[0]], c[ind[1]]))) models += [(sc + 10*acs, S, ind)] if c.shape[0] != n_comp: stain_score, corrs, best_ind = sorted([(sc, S, ind) for sc, S, ind in models if 0 in ind], key = lambda x: x[0])[-1] else: stain_score, corrs, best_ind = models[0] best_ind = sorted(list(best_ind)) # adjusting expected colours if 1 in best_ind: adj_ind = [0, 1] adj_ica = icamodels[1] dirs = adj_ica.mixing_.T.copy() dirs /= np.sqrt(np.sum(dirs**2, axis=-1))[:, np.newaxis] icas = np.cross(dirs[0], dirs[1]) rot_axis = c[adj_ind].mean(axis=0) rot_axis /= la.norm(rot_axis, 2) angles = np.arange(-15., 16.)/180 * np.pi rotated = np.zeros((angles.shape[0], 3)) rotCMY = np.zeros((angles.shape[0], len(adj_ind), 3)) length = c[adj_ind].dot(rot_axis) project = c[adj_ind] - c[adj_ind].dot(rot_axis)[:, np.newaxis] * rot_axis[np.newaxis, :] e1 = project[np.argmax(np.sum(project**2, axis=1))].copy() e1 /= la.norm(e1, 2) e = np.array([e1, np.cross(rot_axis, e1)]) project = e.dot(project.T) for i, a in enumerate(angles): A = e.T.dot( np.array([[np.cos(a), np.sin(a)] ,[-np.sin(a), np.cos(a)]])) rotCMY[i] = (A.dot(project) + length[np.newaxis, :]*rot_axis[:, np.newaxis]).T rotated[i] = np.cross(rotCMY[i, 0], rotCMY[i, 1]) acs = np.abs(rotated.dot(icas)) c[adj_ind] = rotCMY[np.argmax(acs)] if verbose: print "Adjusting expected colours: rotation angle = ", angles[np.argmax(acs)]/np.pi * 180 # choosing the best decomposition ps = np.abs(corrs) P = np.zeros_like(ps) sh = np.array(P.shape) # - 1 s = min(sh) best_score = np.inf best_est = 0 pr = prior[best_ind] X_ = X[non_white_x].T - ica.mean_[:, np.newaxis] dirs = ica.mixing_.T.copy() dirs /= np.sqrt(np.sum(dirs**2, axis=-1))[:, np.newaxis] dist2 = np.array([np.sum((np.eye(3) - ci[:, np.newaxis].dot(ci[np.newaxis, :])).dot(X_)**2, axis=0) for ci in dirs]) comps_prjs_ = dirs.dot(X_) pw = np.exp(-dist2/(2*0.05**2))*comps_prjs_ for i0 in it.combinations(range(max(sh)), max(sh) - min(sh)): sl = [r for r in range(sh[0]) if r not in i0] for i1 in it.permutations(range(s), s): Q = np.zeros(sh) I = np.eye(s) Q[sl] = I[list(i1)] for i2 in range(s + 1): for i3 in it.combinations(range(s), i2): ones_ = np.ones(s) ones_[list(i3)] *= -1 R = Q * ones_ comps = R.dot(dirs) comps_prjs = R.dot(comps_prjs_) est = np.diag(comps_prjs[np.arange(comps_prjs.shape[0]), np.argmax(R.dot(pw), axis = -1)]).dot(comps) + ica.mean_ est /= np.sqrt(np.sum(est**2, axis=-1))[:, np.newaxis] Dist = np.sqrt(np.sum((c[best_ind][(np.arange(len(best_ind)**2)%len(best_ind)).reshape((len(best_ind),)*2).T.ravel()] - est[np.arange(len(best_ind)**2)%len(best_ind)])**2, 
axis=-1)).reshape((len(best_ind),)*2) if Dist.shape[0] > 1: sc = np.mean(np.array([Dist[I[:, i4] == 1, i4]**2/np.mean(Dist[I[:, i4] == 0, i4]) for i4 in range(Dist.shape[0])]).ravel()*pr) else: sc = np.mean(np.array([Dist[I[:, i4] == 1, i4]**2 for i4 in range(Dist.shape[0])]).ravel()*pr) if sc < best_score: best_score = sc P = R best_est = est _, d, _ = la.svd(c) mean_check = np.maximum(0, (ica.mean_/la.norm(ica.mean_, 2)).dot(la.pinv(c))) mean_check = mean_check/np.sum(mean_check) if( d[-1]/d[0] > 0.05 and mean_check[1] > 5*mean_check[np.arange(mean_check.shape[0]) != 1].max() and np.all(np.abs(best_est - (ica.mean_/la.norm(ica.mean_, 2))[np.newaxis, :]) < white_threshold) ): conf = 0 if 1 not in best_ind: best_ind = sorted(best_ind + [1]) best_est_ = np.zeros((best_est.shape[0] + 1, best_est.shape[1])) best_est_[1:] = best_est best_est = best_est_ n_comp += 1 best_est[0] = np.zeros(3) #decomposing image and checking the result res = img.reshape(-1, 3).dot(la.pinv(best_est)) _, d, _ = la.svd(best_est) if n_comp == 3 and d[-1]/d[0] < 0.05 and la.norm(best_est[0], 2) > 0: infos[-1] = 0 n_comp = np.argmax(infos) + 1 rerun = True if verbose: print "Rerun on unstable decomposition" print "singular values", d, d/d[0] print "infos", infos, n_comp continue if n_comp == 3 and len(np.where(np.vstack([c[1], best_est[1]]).dot(np.cross(best_est[0], best_est[2])) < 0)[0]) % 2 == 1: best_ind = [0, 2] best_est = best_est[best_ind] n_comp = 2 res = img.reshape(-1, 3).dot(la.pinv(best_est)) if verbose : print 'Dropping the pigment due to inconsistency' stain_m = c[ 0] - np.ones(3)/la.norm(np.ones(3), 2) stain_m /= la.norm(stain_m, 2) backg_m = c[-1] - np.ones(3)/la.norm(np.ones(3), 2) backg_m /= la.norm(backg_m, 2) def mode(data): h, b = np.histogram(data, bins=50) m = np.argmax(h) return .5*(b[m] + b[m+1]) if( n_comp == 1 and mode(res[:, 0].reshape(img.shape[:2])[mask == 0]) > 1.0*mode(res[:, 0].reshape(img.shape[:2])[mask == 1]) and len(c) > 1 and stain_m.dot(backg_m) < .95 ): infos[0] = 0 n_comp = min(c.shape[0], np.argmax(infos) + 1) rerun = True if verbose: print "Rerun on weak stain" print "stain_m.dot(backg_m)", stain_m.dot(backg_m) continue #Checking if pigment is confused with saturated stain if n_comp > 1 and 1 in best_ind: best_est_ = best_est.copy() best_est_[0] = np.ones(3)/np.sqrt(3) pigm_ = img.reshape(-1, 3).dot(la.pinv(best_est)) pigm = pigm_[mask.ravel() == 1][:, 1] th = np.percentile(pigm_[mask.ravel() == 1, 1], 99) sqlens = (img**2).reshape(-1, 3).sum(axis=-1).reshape(img.shape[:2]) darkest = img[(mask == 1) & (sqlens >= np.percentile(sqlens[mask == 1], 99.5))].reshape(-1, 3) pigm_ = img.reshape(-1, 3).dot(la.pinv(best_est_)) _, d_, _ = la.svd(best_est_) dark_part = np.zeros(img.shape[:2]) dark_part[sqlens >= np.sqrt(3)*(1 - white_threshold)] = sqlens[sqlens >= np.sqrt(3)*(1 - white_threshold)] dark_part[mask == 0] = 0 plt.figure() ax=plt.subplot(121) toshow = np.zeros(img.shape[:2]) toshow[mask == 1] = pigm ax.imshow(toshow) ax=plt.subplot(122) ax.imshow(dark_part) if (d_[-1]/d_[0] < 0.01 or np.corrcoef(np.vstack([pigm, dark_part[mask == 1].ravel()]))[0,1] > 0.75 ) and np.any(np.abs(1 - darkest.mean(axis=0)) < white_threshold): best_ind = [ind for ind in best_ind if ind != 1] best_est = best_est[best_ind] n_comp = len(best_ind) res = img.reshape(-1, 3).dot(la.pinv(best_est)) if verbose: print 'Dropping pigment due to saturated stain' check = np.maximum(0, best_est[0].dot(la.pinv(c[best_ind]))) conf = check[0]/np.sum(check) if type(verbose) == str: data = X steps = 15 stainHist, _ = 
np.histogramdd(data, bins = [np.arange(np.min(data), np.max(data) + (np.max(data) - np.min(data))/steps, (np.max(data) - np.min(data))/steps)]*3) colors = [] sizes = [] for i in range(steps): for j in range(steps): for k in range(steps): colors += [[(i + 0.5)*((np.max(data) - np.min(data))/steps) + np.min(data) , (j + 0.5)*((np.max(data) - np.min(data))/steps) + np.min(data) , (k + 0.5)*((np.max(data) - np.min(data))/steps) + np.min(data)]] sizes += [stainHist[i, j, k]] colorsCMY = np.array(colors) sizes = np.array(sizes) colorsRGB = 1. - colorsCMY fig = plt.figure(figsize=(24, 20)) ax = fig.add_subplot(111, projection='3d') ax.scatter(colorsCMY[:, 0][(sizes > 0) ] , colorsCMY[:, 1][(sizes > 0) ] , colorsCMY[:, 2][(sizes > 0) ] , s=np.log(sizes[ (sizes > 0) ] + 1)*50 , c = colorsRGB[ (sizes > 0) ]) limits = (ax.get_xlim(), ax.get_ylim(), ax.get_zlim()) ax.scatter([ica.mean_[0]], [ica.mean_[1]], [ica.mean_[2]], c = 'k', marker='+') p0, p1 = (ica.mean_ - .67*ica.mixing_.dot(P.T).T[0]/la.norm(ica.mixing_.dot(P.T).T[0], 2)), (ica.mean_ + .67*ica.mixing_.dot(P.T).T[0]/la.norm(ica.mixing_.dot(P.T).T[0], 2)) ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 'k', linewidth=1) if ica.mixing_.shape[1] > 1: p0, p1 = (ica.mean_ - .67*ica.mixing_.dot(P.T).T[1]/la.norm(ica.mixing_.dot(P.T).T[1], 2)), (ica.mean_ + .67*ica.mixing_.dot(P.T).T[1]/la.norm(ica.mixing_.dot(P.T).T[1], 2)) ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 'k', linewidth=1) if ica.mixing_.shape[1] > 2: p0, p1 = (ica.mean_ - .67*ica.mixing_.dot(P.T).T[2]/la.norm(ica.mixing_.dot(P.T).T[2], 2)), (ica.mean_ + .67*ica.mixing_.dot(P.T).T[2]/la.norm(ica.mixing_.dot(P.T).T[2], 2)) ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 'k', linewidth=1) ax.set_xlim(limits[0]) ax.set_ylim(limits[1]) ax.set_zlim(limits[2]) ax.set_xlabel('C') ax.set_ylabel('M') ax.set_zlabel('Y') pca2 = PCA(2).fit(X) n = np.cross(pca2.components_[0], pca2.components_[1]) n /= la.norm(n, 2) az = np.arctan2(n[0], n[1])*90./np.pi el = np.arccos(n[2])*90./np.pi ax.view_init(elev=el, azim=az) ver_name = verbose+'.independent_axes.jpg' fig.savefig(ver_name) print "saving", ver_name dirs = ica.mixing_.T.copy() dirs /= np.sqrt(np.sum(dirs**2, axis=-1))[:, np.newaxis] dist2 = np.array([np.sum((np.eye(3) - ci[:, np.newaxis].dot(ci[np.newaxis, :])).dot(colorsCMY.T - ica.mean_[:, np.newaxis])**2, axis=0) for ci in dirs]) pw_ = np.exp(-dist2/(2*0.05**2)) pwth = 0.5 fig = plt.figure(figsize=(24, 20)) ax = fig.add_subplot(111, projection='3d') ax.scatter(colorsCMY[:, 0][(sizes > 0) & np.any(pw_ > pwth, axis=0)] , colorsCMY[:, 1][(sizes > 0) & np.any(pw_ > pwth, axis=0)] , colorsCMY[:, 2][(sizes > 0) & np.any(pw_ > pwth, axis=0)] , s=np.log(sizes[ (sizes > 0) & np.any(pw_ > pwth, axis=0)] + 1)*50 , c = colorsRGB[ (sizes > 0) & np.any(pw_ > pwth, axis=0)]) R = np.eye(P.shape[0]) if P.shape[0] > n_comp: R[n_comp:] = 0 R[0] *= -1 if 0 in best_ind: est = np.diag(comps_prjs_[np.arange(comps_prjs_.shape[0]), np.argmax(R.dot(pw), axis = -1)]).dot(dirs) + ica.mean_ est /= np.sqrt(np.sum(est**2, axis=-1))[:, np.newaxis] p0, p1 = np.zeros(3) , est[0] ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 1 - np.maximum(0, p1), linewidth=2) if n_comp > 1: p0, p1 = np.zeros(3) , est[1] ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 1 - np.maximum(0, p1), linewidth=2) if n_comp > 2: p0, p1 = np.zeros(3) , est[2] ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 1 - np.maximum(0, p1), linewidth=2) ax.scatter([ica.mean_[0]], 
[ica.mean_[1]], [ica.mean_[2]], c = 'k', marker='+') p0, p1 = (ica.mean_ - .67*ica.mixing_.T[0]/la.norm(ica.mixing_.T[0], 2)), (ica.mean_ + .67*ica.mixing_.T[0]/la.norm(ica.mixing_.T[0], 2)) ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 'k', linewidth=1, alpha=.67) if ica.mixing_.shape[1] > 1: p0, p1 = (ica.mean_ - .67*ica.mixing_.T[1]/la.norm(ica.mixing_.T[1], 2)), (ica.mean_ + .67*ica.mixing_.T[1]/la.norm(ica.mixing_.T[1], 2)) ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 'k', linewidth=1, alpha=.67) if ica.mixing_.shape[1] > 2: p0, p1 = (ica.mean_ - .67*ica.mixing_.T[2]/la.norm(ica.mixing_.T[2], 2)), (ica.mean_ + .67*ica.mixing_.T[2]/la.norm(ica.mixing_.T[2], 2)) ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 'k', linewidth=1, alpha=.67) class Arrow3D(FancyArrowPatch): def __init__(self, xs, ys, zs, *args, **kwargs): FancyArrowPatch.__init__(self, (0,0), (0,0), *args, **kwargs) self._verts3d = xs, ys, zs def draw(self, renderer): xs3d, ys3d, zs3d = self._verts3d xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, renderer.M) self.set_positions((xs[0],ys[0]),(xs[1],ys[1])) FancyArrowPatch.draw(self, renderer) p0, p1 = ica.mean_, (ica.mean_ + .15*ica.mixing_.dot(R.T).T[0]/la.norm(ica.mixing_.dot(R.T).T[0], 2)) a = Arrow3D([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], mutation_scale=20, lw=2, arrowstyle="-|>", color="b") ax.add_artist(a) if ica.mixing_.shape[1] > 1: p0, p1 = ica.mean_, (ica.mean_ + .15*ica.mixing_.dot(R.T).T[1]/la.norm(ica.mixing_.dot(R.T).T[1], 2)) a = Arrow3D([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], mutation_scale=20, lw=2, arrowstyle="-|>", color="b") ax.add_artist(a) if ica.mixing_.shape[1] > 2: p0, p1 = ica.mean_, (ica.mean_ + .15*ica.mixing_.dot(R.T).T[2]/la.norm(ica.mixing_.dot(R.T).T[2], 2)) a = Arrow3D([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], mutation_scale=20, lw=2, arrowstyle="-|>", color="b") ax.add_artist(a) ax.set_xlim(limits[0]) ax.set_ylim(limits[1]) ax.set_zlim(limits[2]) ax.set_xlabel('C') ax.set_ylabel('M') ax.set_zlabel('Y') ax.view_init(elev=el, azim=az) ver_name = verbose+'.proposing_colours.jpg' fig.savefig(ver_name) print "saving", ver_name fig = plt.figure(figsize=(24, 20)) ax = fig.add_subplot(111, projection='3d') R = P if P.shape[0] > n_comp: R[n_comp:] = 0 est = best_est p0, p1 = np.zeros(3) , est[0] ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 1 - np.maximum(0, p1), linewidth=2) if n_comp > 1: p0, p1 = np.zeros(3) , est[1] ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 1 - np.maximum(0, p1), linewidth=2) if n_comp > 2: p0, p1 = np.zeros(3) , est[2] ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 1 - np.maximum(0, p1), linewidth=2) ax.scatter([ica.mean_[0]], [ica.mean_[1]], [ica.mean_[2]], c = 'k', marker='+') p0, p1 = (ica.mean_ - .67*ica.mixing_.T[0]/la.norm(ica.mixing_.T[0], 2)), (ica.mean_ + .67*ica.mixing_.T[0]/la.norm(ica.mixing_.T[0], 2)) ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 'k', linewidth=1, alpha=.67) if ica.mixing_.shape[1] > 1: p0, p1 = (ica.mean_ - .67*ica.mixing_.T[1]/la.norm(ica.mixing_.T[1], 2)), (ica.mean_ + .67*ica.mixing_.T[1]/la.norm(ica.mixing_.T[1], 2)) ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 'k', linewidth=1, alpha=.67) if ica.mixing_.shape[1] > 2: p0, p1 = (ica.mean_ - .67*ica.mixing_.T[2]/la.norm(ica.mixing_.T[2], 2)), (ica.mean_ + .67*ica.mixing_.T[2]/la.norm(ica.mixing_.T[2], 2)) ax.plot([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], c = 'k', linewidth=1, alpha=.67) p0, p1 = 
ica.mean_, (ica.mean_ + .15*ica.mixing_.dot(R.T).T[0]/la.norm(ica.mixing_.dot(R.T).T[0], 2)) a = Arrow3D([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], mutation_scale=20, lw=2, arrowstyle="-|>", color="b") ax.add_artist(a) if ica.mixing_.shape[1] > 1: p0, p1 = ica.mean_, (ica.mean_ + .15*ica.mixing_.dot(R.T).T[1]/la.norm(ica.mixing_.dot(R.T).T[1], 2)) a = Arrow3D([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], mutation_scale=20, lw=2, arrowstyle="-|>", color="b") ax.add_artist(a) if ica.mixing_.shape[1] > 2: p0, p1 = ica.mean_, (ica.mean_ + .15*ica.mixing_.dot(R.T).T[2]/la.norm(ica.mixing_.dot(R.T).T[2], 2)) a = Arrow3D([p0[0], p1[0]], [p0[1], p1[1]], [p0[2], p1[2]], mutation_scale=20, lw=2, arrowstyle="-|>", color="b") ax.add_artist(a) ax.plot([0, c[best_ind[0], 0]], [0., c[best_ind[0], 1]], [0., c[best_ind[0], 2]], c = 1-c[best_ind[0]], ls='--', linewidth=2) if c[best_ind].shape[0] > 1: ax.plot([0, c[best_ind[1], 0]], [0., c[best_ind[1], 1]], [0., c[best_ind[1], 2]], c = 1-c[best_ind[1]], ls='--', linewidth=2) if c[best_ind].shape[0] > 2: ax.plot([0, c[best_ind[2], 0]], [0., c[best_ind[2], 1]], [0., c[best_ind[2], 2]], c = 1-c[best_ind[2]], ls='--', linewidth=2) ax.set_xlim(limits[0]) ax.set_ylim(limits[1]) ax.set_zlim(limits[2]) ax.set_xlabel('C') ax.set_ylabel('M') ax.set_zlabel('Y') ax.view_init(elev=el, azim=az) ver_name = verbose+'.classifying_colours.jpg' fig.savefig(ver_name) print "saving", ver_name if verbose : print "infos", infos print "check", check print 'conf', conf pigm = np.zeros_like(res[:, 0]) if 1 in best_ind: pigm = res[:, 1] c_est = np.ones((len(norms), c.shape[1])) c_est_ = np.ones_like(c) c_est_[best_ind] -= best_est c_est[norms > 1.e-2] = c_est_ return res[:, 0].reshape(img.shape[:2]), conf, pigm.reshape(img.shape[:2]), np.uint8(np.maximum(0, np.minimum(1, c_est))*255)
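# The counts indexed as stainHist[i, j, k] above come from np.histogramdd, which
# returns the count array together with the bin edges. A minimal sketch of building
# such a 3-D colour histogram and the matching bin centres (synthetic stand-in data;
# the variable names are only illustrative, not the original pipeline):
import numpy as np

rng = np.random.RandomState(0)
data = rng.rand(5000, 3)                                   # stand-in CMY samples in [0, 1]
steps = 16
step = (np.max(data) - np.min(data)) / steps
edges = np.arange(np.min(data), np.max(data) + step, step)

# histogramdd returns (counts, list_of_edge_arrays); the counts are what the
# plotting code above indexes per (i, j, k) cell.
stainHist, _ = np.histogramdd(data, bins=[edges] * 3)

# Bin centres along one axis, matching the (i + 0.5) * step + min pattern above.
centres = (edges[:-1] + edges[1:]) / 2.0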
def fit_and_score(expression_filename, treatment_filename, golden_filename): expressions = csv_map( expression_filename, header_method=lambda j, entry: genecoder.encode(entry), entry_method=lambda i, j, entry, d: d.__setitem__((i, j), entry), cleanup_method=collect_in_array) from scipy.stats import pearsonr, spearmanr correlations = np.zeros((genecoder.total_seen(), genecoder.total_seen()), dtype=np.float64) for i in range(genecoder.total_seen()): for j in range(i, genecoder.total_seen()): # correlate columns of each gene pair in expression matrix if i == j: correlations[i, i] = 1.0 continue (r, pval) = pearsonr(expressions[:, i], expressions[:, j]) correlations[i, j] = r correlations[j, i] = r # Build dimension-reduced model of correlation data from sklearn.decomposition import FastICA nmf = FastICA(n_components=120) transformed_correlations = nmf.fit_transform(correlations) # print(expressions) print('Expressions shape: ', expressions.shape) nne = NearestNeighbors(n_neighbors=5, radius=0.1, algorithm='auto', metric='manhattan', n_jobs=4) nne.fit(transformed_correlations) # Build nearest-neighbor index of chip treatments treatments = csv_map(treatment_filename, row_method=treatment_row_method, cleanup_method=collect_treatments) nnt = NearestNeighbors(n_neighbors=5, radius=0.1, algorithm='auto', metric='jaccard', n_jobs=4) nnt.fit(treatments) # print(treatments) print("Treatments shape: ", treatments.shape) sparse_goldens, golden_i_indices, golden_j_indices = csv_map( golden_filename, row_method=golden_row_method, cleanup_method=collect_goldens) goldens = sparse_goldens.toarray() # print(goldens) from sklearn.mixture import BayesianGaussianMixture def correlation_modes(ex): modes = BayesianGaussianMixture(n_components=3) modes.fit(ex.flatten().reshape(-1, 1)) expression_centers = modes.means_ (anticorrelated, uncorrelated, correlated) = sorted(expression_centers) return (anticorrelated, uncorrelated, correlated) (anticorrelated, uncorrelated, correlated) = [-1, 0, 1] print("Correlation level modes: ", anticorrelated, uncorrelated, correlated) # predict the goldens # - compute overall correlation of gene expressions across all experiments # - transform the correlation data to the nmf space # - synthesize a probe vector by setting that gene's element to its max correlation level in the data and rest to zero # - get nearest neighbors of probe in nmf and treatment spaces (must be near in both) # - average together nmf representations of those rows # - transform back to expression space and threshold; these are predictions # - compute AUROC vs golden array predicted_correlations = zeros( (genecoder.total_seen(), genecoder.total_seen()), dtype=np.float64) predicted_relationships = zeros( (genecoder.total_seen(), genecoder.total_seen()), dtype=np.bool_) for i in range(genecoder.total_seen()): genevector = zeros((1, genecoder.total_seen())) genevector[0, i] = np.max(expressions[:, i]) transformed_genevector = nmf.transform(genevector) common_inds = [] ex_neighbors = 5 t_neighbors = 3 (nmf_dist, nmf_neighbor_inds) = nne.kneighbors( transformed_genevector, min(expressions.shape[0], ex_neighbors), True) # (cnd_dist, cnd_neighbor_inds) = nnt.kneighbors(treatments[nmf_neighbor_inds], min(treatments.shape[0], t_neighbors), True) # common_inds = np.intersect1d(nmf_neighbor_inds, cnd_neighbor_inds, assume_unique=False) common_inds = nmf_neighbor_inds rows_to_average = transformed_correlations.take(common_inds, axis=0) average_transformed_correlation = np.average(rows_to_average, axis=1)[0] if i % 100 == 
0: stdout.write("\nAveraging transformed expressions for row %d." % i) stdout.flush() else: stdout.write('.') stdout.flush() # print("Average transformed correlation for row %d: \n" % i, average_transformed_correlation.shape) average_correlation_prediction = nmf.inverse_transform( [average_transformed_correlation]) # print("\nMax predicted correlation vector component: ", max(average_expression_prediction)) predicted_correlations[i] = average_correlation_prediction golden_nonzero_count = np.count_nonzero(goldens.flatten()) def topcomponents(vec, num_components=3): return sorted(enumerate(vec), key=lambda x: x[1], reverse=True)[0:num_components] golden_i_set = set(golden_i_indices) golden_j_set = set(golden_j_indices) print("Golden i set size: %d" % len(golden_i_set)) for j in range(predicted_correlations.shape[1]): for i in range(predicted_correlations.shape[0]): p = predicted_correlations[i, j] r = True if abs(correlated - p) < abs(uncorrelated - p) or abs( anticorrelated - p) < abs(uncorrelated - p) else False predicted_relationships[i, j] = r # print(predicted_relationships) auroc = roc_auc_score( goldens[golden_i_indices, golden_j_indices], predicted_relationships[golden_i_indices, golden_j_indices]) print("AUROC: ", auroc) print('Golden nonzero count: ', golden_nonzero_count) print( 'Prediction nonzero count on golden set: ', np.count_nonzero(predicted_relationships[golden_i_indices, golden_j_indices])) print('Prediction nonzero count on all genes: ', np.count_nonzero(predicted_relationships.flatten()))
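# The gene-pair loop earlier in fit_and_score recomputes Pearson r one pair at a time;
# when the p-values are not needed, np.corrcoef on the transposed expression matrix
# gives the same correlation matrix in a single call. A minimal sketch with synthetic
# data (shapes and names are placeholders, not the original pipeline):
import numpy as np
from sklearn.decomposition import FastICA
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
expressions = rng.randn(50, 20)            # 50 experiments x 20 genes (toy data)

# Same r values as looping over column pairs with scipy.stats.pearsonr.
correlations = np.corrcoef(expressions.T)

# Reduce each gene's correlation profile and index it for neighbour queries.
ica = FastICA(n_components=5, random_state=0)
transformed = ica.fit_transform(correlations)
nne = NearestNeighbors(n_neighbors=5, metric='manhattan').fit(transformed)
distances, indices = nne.kneighbors(transformed[:1])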
def main(addNoise = 0, savedir = None, doFastICA = False): N = 200 tt = linspace(0, 10, N) # make sources s1 = 4 + cos(tt*5) s2 = tt % 2 s1 -= mean(s1) s1 /= std(s1) s2 -= mean(s2) s2 /= std(s2) pyplot.figure(1) pyplot.subplot(4,1,1) pyplot.title('original sources') pyplot.plot(tt, s1, 'bo-') pyplot.subplot(4,1,2) pyplot.plot(tt, s2, 'bo-') A = array([[3, 1], [-2, .3]]) S = vstack((s1, s2)).T #print 'S', S print 'kurt(s1) =', kurt(s1) print 'kurt(s2) =', kurt(s2) print ' negentropy(s1) =', negentropy(s1) print ' negentropy(s2) =', negentropy(s2) print ' logcosh10(s1) =', logcosh10(s1) print ' logcosh10(s2) =', logcosh10(s2) print ' logcosh15(s1) =', logcosh15(s1) print ' logcosh15(s2) =', logcosh15(s2) print ' logcosh20(s1) =', logcosh20(s1) print ' logcosh20(s2) =', logcosh20(s2) print ' negexp(s1) =', negexp(s1) print ' negexp(s2) =', negexp(s2) X = dot(S, A) if addNoise > 0: print 'Adding noise!' X += random.normal(0, addNoise, X.shape) #print 'X', X x1 = X[:,0] x2 = X[:,1] #print 'kurt(x1) =', kurt(x1) #print 'kurt(x2) =', kurt(x2) pyplot.subplot(4,1,3) pyplot.title('observed signal') pyplot.plot(tt, x1, 'ro-') pyplot.subplot(4,1,4) pyplot.plot(tt, x2, 'ro-') pyplot.figure(2) pyplot.subplot(4,1,1) pyplot.title('original sources') pyplot.hist(s1) pyplot.subplot(4,1,2) pyplot.hist(s2) pyplot.subplot(4,1,3) pyplot.title('observed signal') pyplot.hist(x1) pyplot.subplot(4,1,4) pyplot.hist(x2) pca = PCA(X) #W = pca.toWhitePC(X) W = pca.toZca(X) w1 = W[:,0] w2 = W[:,1] print 'kurt(w1) =', kurt(w1) print 'kurt(w2) =', kurt(w2) pyplot.figure(3) pyplot.subplot(4,2,1) pyplot.title('observed signal') pyplot.hist(x1) pyplot.subplot(4,2,3) pyplot.hist(x2) pyplot.subplot(2,2,2) pyplot.plot(x1, x2, 'bo') pyplot.subplot(4,2,5) pyplot.title('whitened observed signal') pyplot.hist(w1) pyplot.subplot(4,2,7) pyplot.hist(w2) pyplot.subplot(2,2,4) pyplot.plot(w1, w2, 'bo') # Compute kurtosis at different angles thetas = linspace(0, pi, 100) kurt1 = 0 * thetas for ii, theta in enumerate(thetas): kurt1[ii] = kurt(dot(rotMat(theta)[0,:], W.T).T) # functions of data minfnK = lambda data: -kurt(data)**2 minfnNEnt = lambda data: -negentropy(data) minfnLC10 = lambda data: -logcosh10(data) minfnLC15 = lambda data: -logcosh15(data) minfnLC20 = lambda data: -logcosh20(data) minfnNExp = lambda data: -negexp(data) # functions of the rotation angle, given W as the data minAngleFnK = lambda theta: minfnK(dot(rotMat(theta)[0,:], W.T).T) minAngleFnNEnt = lambda theta: minfnNEnt(dot(rotMat(theta)[0,:], W.T).T) minAngleFnLC10 = lambda theta: minfnLC10(dot(rotMat(theta)[0,:], W.T).T) minAngleFnLC15 = lambda theta: minfnLC15(dot(rotMat(theta)[0,:], W.T).T) minAngleFnLC20 = lambda theta: minfnLC20(dot(rotMat(theta)[0,:], W.T).T) minAngleFnNExp = lambda theta: minfnNExp(dot(rotMat(theta)[0,:], W.T).T) ######### # Chosen objective function. Change this line to change which objective is used. 
######### minDataFn = minfnK minAngleFn = lambda theta: minDataFn(dot(rotMat(theta)[0,:], W.T).T) if doFastICA: # Use FastICA from sklearn #pdb.set_trace() from sklearn.decomposition import FastICA rng = random.RandomState(1) ica = FastICA(random_state = rng, whiten = False) ica.fit(W) Recon = ica.transform(W) # Estimate the sources #S_fica /= S_fica.std(axis=0) # (should already be done) Ropt = ica.get_mixing_matrix() else: # Manually fit angle using fmin_bfgs angle0 = 0 xopt = fmin_bfgs(minAngleFn, angle0) xopt = xopt[0] % pi Ropt = rotMat(xopt) Recon = dot(W, Ropt.T) mnval = array([minAngleFn(aa) for aa in thetas]) pyplot.figure(4) pyplot.title('objective vs. angle') #pyplot.plot(thetas, kurt1, 'bo-', thetas, mnval, 'k', xopt, minAngleFn(xopt), 'ko') pyplot.plot(thetas, mnval, 'b') if not doFastICA: pyplot.hold(True) pyplot.plot(xopt, minAngleFn(xopt), 'ko') pyplot.figure(5) pyplot.title('different gaussianness measures vs. angle') pyplot.subplot(6,1,1); pyplot.title('Kurt'); pyplot.plot(thetas, array([minAngleFnK(aa) for aa in thetas])) pyplot.subplot(6,1,2); pyplot.title('NegEnt'); pyplot.plot(thetas, array([minAngleFnNEnt(aa) for aa in thetas])) pyplot.subplot(6,1,3); pyplot.title('LogCosh10'); pyplot.plot(thetas, array([minAngleFnLC10(aa) for aa in thetas])) pyplot.subplot(6,1,4); pyplot.title('LogCosh15'); pyplot.plot(thetas, array([minAngleFnLC15(aa) for aa in thetas])) pyplot.subplot(6,1,5); pyplot.title('LogCosh20'); pyplot.plot(thetas, array([minAngleFnLC20(aa) for aa in thetas])) pyplot.subplot(6,1,6); pyplot.title('NegExp'); pyplot.plot(thetas, array([minAngleFnNExp(aa) for aa in thetas])) print 'kurt(r1) =', kurt(Recon[:,0]) print 'kurt(r2) =', kurt(Recon[:,1]) print print 'objective(s1) =', minDataFn(s1) print 'objective(s2) =', minDataFn(s2) print 'objective(w1) =', minDataFn(w1) print 'objective(w2) =', minDataFn(w2) print 'objective(r1) =', minDataFn(Recon[:,0]) print 'objective(r2) =', minDataFn(Recon[:,1]) print 'optimal theta:', if doFastICA: print '<not computed with FastICA>' else: print xopt, '(+pi/2 =', (xopt+pi/2)%pi, ')' print 'Optimal rotation matrix:\n', Ropt pyplot.figure(6) pyplot.subplot(4,1,1) pyplot.title('original sources') pyplot.plot(tt, s1, 'bo-') pyplot.subplot(4,1,2) pyplot.plot(tt, s2, 'bo-') pyplot.subplot(4,1,3) pyplot.title('reconstructed sources') pyplot.plot(tt, Recon[:,0], 'go-') pyplot.subplot(4,1,4) pyplot.plot(tt, Recon[:,1], 'go-') #pyplot.show() if savedir: figname = lambda ii : os.path.join(savedir, 'figure_%02d.png' % ii) for ii in range(6): pyplot.figure(ii+1) pyplot.savefig(figname(ii+1)) print 'plots saved in', savedir else: import ipdb; ipdb.set_trace()
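# ica.get_mixing_matrix() only exists in old scikit-learn releases; current versions
# expose the estimated mixing matrix as the mixing_ attribute. A minimal sketch of the
# same two-source separation against a recent scikit-learn (variable names illustrative):
import numpy as np
from sklearn.decomposition import FastICA

t = np.linspace(0, 10, 200)
S = np.c_[4 + np.cos(5 * t), t % 2]        # two toy sources
A = np.array([[3, 1], [-2, 0.3]])          # mixing matrix
X = S.dot(A)                               # observed mixtures

ica = FastICA(n_components=2, random_state=0)
S_est = ica.fit_transform(X)               # estimated sources, up to order/sign/scale
A_est = ica.mixing_                        # replaces the removed get_mixing_matrix()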
def main(mode): path = '/local/attale00/AFLW_ALL' path_ea = path+'/color128/' fileNames = utils.getAllFiles(path_ea); labs=utils.parseLabelFiles(path+'/labels/labels','mouth_opening',fileNames,cutoffSeq='.png',suffix='_face0.labels') testSet = fg.dataContainer(labs) testSetMirror = fg.dataContainer(labs) for f in range(len(testSetMirror.fileNames)): testSetMirror.fileNames[f]+='M' roi=(50,74,96,160) X=fg.getAllImagesFlat(path_ea,testSet.fileNames,(128,256),roi=roi) Y=fg.getAllImagesFlat(path+'/mirror128/',testSet.fileNames,(128,256),roi=roi) Z=np.concatenate((X,Y),axis=0) # perform ICA ica = FastICA(n_components=100,whiten=True) ica.fit(Z) meanI=np.mean(Z,axis=0) X1=X-meanI Y1=Y-meanI data=ica.transform(X1) datam=ica.transform(Y1) filters=ica.components_ for i in range(len(fileNames)): testSet.data[i].extend(data[i,:]) testSetMirror.data[i].extend(datam[i,:]) strel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(3,3)) #fg.getHogFeature(testSet,roi,path=path_ea,ending='.png',extraMask = None,orientations = 3, cells_per_block=(6,2),maskFromAlpha=False) #fg.getColorHistogram(testSet,roi,path=path_ea,ending='.png',colorspace='lab',bins=10) #pca # n_samples, n_features = X.shape # # mean_ = np.mean(X, axis=0) # X -= mean_ # U, S, V = linalg.svd(X) # explained_variance_ = (S ** 2) / n_samples # explained_variance_ratio_ = (explained_variance_ /explained_variance_.sum()) # K=V / S[:, np.newaxis] * np.sqrt(n_samples) # filters=K[:100] # data=np.dot(X,filters.T) testSet.addContainer(testSetMirror) testSet.targetNum=map(utils.mapMouthLabels2Two,testSet.target) rf=classifierUtils.standardRF(max_features = np.sqrt(len(testSet.data[0])),min_split=5,max_depth=40) if mode in ['s','v']: print 'Classifying with loaded classifier' _classifyWithOld(path,testSet,mode) elif mode in ['c']: print 'cross validation of data' classifierUtils.dissectedCV(rf,testSet) elif mode in ['save']: print 'saving new classifier' _saveRF(testSet,rf,filters=filters,meanI=meanI) else: print 'not doing anything'
def nn2(xs, ys, xs_test, ys_test, n_components, clf_constructor): ks = [0 for _ in range(10)] cataccs = [0 for _ in range(10)] ys = [to_categorical(ys[0]), to_categorical(ys[1])] ys_test = [to_categorical(ys_test[0]), to_categorical(ys_test[1])] for i in range(2): shape = np.shape(xs[i])[1] n_components[i] = shape model = utils.create_adult_model( shape, 2) if i == 0 else utils.create_wine_model(shape, 5) model.fit(xs[i][:10000], ys[i][:10000], batch_size=50, epochs=10, verbose=False) cataccs[i] = model.evaluate(xs_test[i], ys_test[i], verbose=False)[1] * 100 for k in range(2, 11): try: clf = clf_constructor(n_clusters=k) except: clf = clf_constructor(n_components=k) for i in range(2): pca = PCA(n_components=n_components[2 + i]) transformed = pca.fit_transform(xs[i]) transformed_test = pca.transform(xs_test[i]) predict = to_categorical(clf.fit_predict(transformed[:10000])) predict_test = to_categorical(clf.predict( transformed_test[:10000])) input_dims = [n_components[2 + i], k] model = utils.create_mi_adult_model( input_dims, 2) if i == 0 else utils.create_mi_wine_model( input_dims, 5) model.fit([transformed[:10000], predict], ys[i][:10000], batch_size=50, epochs=10, verbose=False) catacc = model.evaluate([transformed_test, predict_test], ys_test[i], verbose=False)[1] * 100 if catacc > cataccs[2 + i]: ks[2 + i] = k cataccs[2 + i] = catacc ica = FastICA(n_components=n_components[4 + i]) transformed = ica.fit_transform(xs[i]) transformed_test = ica.transform(xs_test[i]) predict = to_categorical(clf.fit_predict(transformed[:10000])) predict_test = to_categorical(clf.predict( transformed_test[:10000])) input_dims = [n_components[4 + i], k] model = utils.create_mi_adult_model( input_dims, 2) if i == 0 else utils.create_mi_wine_model( input_dims, 5) model.fit([transformed[:10000], predict], ys[i][:10000], batch_size=50, epochs=10, verbose=False) catacc = model.evaluate([transformed_test, predict_test], ys_test[i], verbose=False)[1] * 100 if catacc > cataccs[4 + i]: ks[4 + i] = k cataccs[4 + i] = catacc if i == 1: rp = GaussianRandomProjection(eps=0.95) transformed = rp.fit_transform(xs[i]) transformed_test = rp.transform(xs_test[i]) predict = to_categorical(clf.fit_predict(transformed[:10000])) predict_test = to_categorical( clf.predict(transformed_test[:10000])) input_dims = [np.shape(transformed)[1], k] model = utils.create_mi_wine_model(input_dims, 5) model.fit([transformed[:10000], predict], ys[i][:10000], batch_size=50, epochs=10, verbose=False) catacc = model.evaluate([transformed_test, predict_test], ys_test[i], verbose=False)[1] * 100 if catacc > cataccs[6 + i]: ks[6 + i] = k cataccs[6 + i] = catacc encoder, vae = utils.create_vae( np.shape(xs[i])[1], n_components[8 + i]) vae.fit(xs[i], batch_size=50, epochs=10, verbose=False) transformed = encoder.predict(xs[i], verbose=False) transformed_test = encoder.predict(xs_test[i], verbose=False) predict = to_categorical(clf.fit_predict(transformed[:10000])) predict_test = to_categorical(clf.predict( transformed_test[:10000])) input_dims = [n_components[8 + i], k] model = utils.create_mi_adult_model( input_dims, 2) if i == 0 else utils.create_mi_wine_model( input_dims, 5) model.fit([transformed[:10000], predict], ys[i][:10000], batch_size=50, epochs=10, verbose=False) catacc = model.evaluate([transformed_test, predict_test], ys_test[i], verbose=False)[1] * 100 if catacc > cataccs[8 + i]: ks[8 + i] = k cataccs[8 + i] = catacc plot.style.use('seaborn-darkgrid') plot.title(f'Influence of feature transformation on the NN accuracy') color = [] for _ 
in range(5): color.append('tab:blue') color.append('tab:orange') x = [] count = 1 for _ in range(5): x.append(count) count += 0.5 x.append(count) count += 1 plot.bar(x, cataccs, color=color, width=0.75) x = [] count = 1.25 for _ in range(5): x.append(count) count += 1.5 plot.xticks(x, ['None', 'PCA', 'ICA', 'RP', 'VAE']) plot.xlabel('Feature transformation method') plot.ylabel('Categorical accuracy (%)') plot.show()
n_comp = 12 # tSVD tsvd = TruncatedSVD(n_components=n_comp, random_state=420) tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1)) tsvd_results_test = tsvd.transform(test) # PCA pca = PCA(n_components=n_comp, random_state=420) pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1)) pca2_results_test = pca.transform(test) # ICA ica = FastICA(n_components=n_comp, random_state=420) ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1)) ica2_results_test = ica.transform(test) # GRP grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420) grp_results_train = grp.fit_transform(train.drop(["y"], axis=1)) grp_results_test = grp.transform(test) # SRP srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420) srp_results_train = srp.fit_transform(train.drop(["y"], axis=1)) srp_results_test = srp.transform(test) # Append decomposition components to datasets for i in range(1, n_comp+1): train['pca_' + str(i)] = pca2_results_train[:,i-1] test['pca_' + str(i)] = pca2_results_test[:, i-1]
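# The loop above only writes the PCA columns back into train/test. If the other
# decompositions computed above are meant to be appended the same way (an assumption;
# this reuses the variables from the snippet), the extended loop could look like:
for i in range(1, n_comp + 1):
    train['ica_' + str(i)] = ica2_results_train[:, i - 1]
    test['ica_' + str(i)] = ica2_results_test[:, i - 1]

    train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

    train['grp_' + str(i)] = grp_results_train[:, i - 1]
    test['grp_' + str(i)] = grp_results_test[:, i - 1]

    train['srp_' + str(i)] = srp_results_train[:, i - 1]
    test['srp_' + str(i)] = srp_results_test[:, i - 1]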
# PCA on Test Set X_test_PCA = pca.transform(X_test) X_test_PCA = pd.DataFrame(data=X_test_PCA, index=X_test.index) X_test_PCA_inverse = pca.inverse_transform(X_test_PCA) X_test_PCA_inverse = pd.DataFrame(data=X_test_PCA_inverse, \ index=X_test.index) scatterPlot(X_test_PCA, y_test, "PCA") anomalyScoresPCA = anomalyScores(X_test, X_test_PCA_inverse) preds = plotResults(y_test, anomalyScoresPCA, True) # Independent Component Analysis on Test Set X_test_fastICA = fastICA.transform(X_test) X_test_fastICA = pd.DataFrame(data=X_test_fastICA, index=X_test.index) X_test_fastICA_inverse = fastICA.inverse_transform(X_test_fastICA) X_test_fastICA_inverse = pd.DataFrame(data=X_test_fastICA_inverse, \ index=X_test.index) scatterPlot(X_test_fastICA, y_test, "Independent Component Analysis") anomalyScoresFastICA = anomalyScores(X_test, X_test_fastICA_inverse) plotResults(y_test, anomalyScoresFastICA) X_test_miniBatchDictLearning = miniBatchDictLearning.transform(X_test) X_test_miniBatchDictLearning = \ pd.DataFrame(data=X_test_miniBatchDictLearning, index=X_test.index)
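# anomalyScores is defined elsewhere in the original source. A hypothetical minimal
# version, scoring each row by its normalised squared reconstruction error (a common
# choice for this PCA/ICA reconstruction pattern):
import numpy as np
import pandas as pd

def anomaly_scores(original_df, reconstructed_df):
    # Sum of squared reconstruction errors per row, rescaled to [0, 1].
    loss = np.sum((np.array(original_df) - np.array(reconstructed_df)) ** 2, axis=1)
    loss = pd.Series(loss, index=original_df.index)
    return (loss - loss.min()) / (loss.max() - loss.min())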
joblib.dump(sparse_pca_model, 'sparse_pca_model.pkl') joblib.dump(sparse_pca_X_new, 'sparse_pca_X_new.pkl') print sparse_pca_model kernel_pca = KernelPCA(n_components=50) kernel_pca_model = kernel_pca.fit(kernel_pca_data) kernel_X_new = kernel_pca.fit_transform(X) joblib.dump(kernel_pca_model, 'kernel_pca_model.pkl') joblib.dump(kernel_X_new, 'kernel_X_new.pkl') fast_ica = FastICA(n_components=None) fast_ica_start = time.time() fast_ica_model = fast_ica.fit(fast_ica_data) fast_ica_end = time.time() print 'fast_ica fit time', fast_ica_end - fast_ica_start fast_ica_X_new = fast_ica.transform(X) joblib.dump(fast_ica_model, 'fast_ica_model.pkl') joblib.dump(fast_ica_X_new, 'fast_ica_X_new.pkl') print fast_ica_model ''' nmf = NMF(n_components=None) nmf_start = time.time() #nmf_model = nmf.fit(nmf_data) nmf_X_new = nmf.fit_transform(X) nmf_end = time.time() print 'nmf fit time', nmf_end - nmf_start #joblib.dump(nmf_model, 'nmf_model.pkl') joblib.dump(nmf_X_new, 'nmf_X_new.pkl') print nmf_model '''
def main(mode): path = '/local/attale00/AFLW_ALL/' path_ea = '/local/attale00/AFLW_cropped/cropped3/' # fileNames = utils.getAllFiles(path_ea); # minr = 10000; # for f in fileNames: # im = cv2.imread(path_ea+f,-1) # if im.shape[0]!=40 or im.shape[1]!=120: # print f # print im.shape # minr = minr if im.shape[0]>= minr else im.shape[0] # # print minr # labs=utils.parseLabelFiles(path+'/labels/labels','mouth_opening',fileNames,cutoffSeq='.png',suffix='_face0.labels') testSet = fg.dataContainer(labs) testSet.targetNum=map(utils.mapMouthLabels2Two,testSet.target) roi=(0,37,0,115) roi=None #roi=(44,84,88,168) # eM=np.load('/home/attale00/Desktop/mouthMask.npy') # m=cv2.resize(np.uint8(eM),(256,256)); # strel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(3,3)) # dil = cv2.dilate(m,strel) # # m=dil>0; #X=fg.getAllImagesFlat(path_ea,testSet.fileNames,(40,120),roi=roi) # perform ICA names_open = [] names_closed = [] for i,f in enumerate(testSet.fileNames): if testSet.targetNum[i] == 0: names_closed.append(f) elif testSet.targetNum[i] == 1: names_open.append(f) Xopen = fg.getAllImagesFlat(path_ea,names_open,(40,120)) XClosed = fg.getAllImagesFlat(path_ea,names_closed,(40,120)) if mode not in ['s','v']: icaopen = FastICA(n_components=100,whiten=True) icaopen.fit(Xopen) meanIopen=np.mean(Xopen,axis=0) X1open=Xopen-meanIopen dataopen=icaopen.transform(X1open) filtersopen=icaopen.components_ plottingUtils.showICAComponents(filtersopen,(40,120),4,4) icaclosed = FastICA(n_components=100,whiten=True) icaclosed.fit(XClosed) meanIclosed=np.mean(XClosed,axis=0) X1closed=XClosed-meanIclosed dataclosed=icaclosed.transform(X1closed) filtersclosed=icaclosed.components_ plottingUtils.showICAComponents(filtersclosed,(40,120),4,4) plt.show() elif mode in ['s','v']: W=np.load('/home/attale00/Desktop/classifiers/patches/filterMP1.npy') m=np.load('/home/attale00/Desktop/classifiers/patches/meanIMP1.npy') X1=X-m data=np.dot(X1,W.T) for i in range(len(fileNames)): testSet.data[i].extend(data[i,:]) strel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(3,3)) #fg.getHogFeature(testSet,roi,path=path_ea,ending='.png',extraMask = None,orientations = 5, cells_per_block=(3,3),pixels_per_cell=(24,8),maskFromAlpha=False) #fg.getColorHistogram(testSet,roi,path=path_ea,ending='.png',colorspace='lab',bins=20) #pca # n_samples, n_features = X.shape # # mean_ = np.mean(X, axis=0) # X -= mean_ # U, S, V = linalg.svd(X) # explained_variance_ = (S ** 2) / n_samples # explained_variance_ratio_ = (explained_variance_ /explained_variance_.sum()) # K=V / S[:, np.newaxis] * np.sqrt(n_samples) # filters=K[:100] # data=np.dot(X,filters.T) rf=classifierUtils.standardRF(max_features = 27,min_split=13,max_depth=40) #rf = svm.NuSVC() #rf = linear_model.SGDClassifier(loss='perceptron', eta0=1, learning_rate='constant', penalty=None) if mode in ['s','v']: print 'Classifying with loaded classifier' _classifyWithOld(path,testSet,mode) elif mode in ['c']: print 'cross validation of data' classifierUtils.dissectedCV(rf,testSet) elif mode in ['save']: print 'saving new classifier' _saveRF(testSet,rf,filters=filters,meanI=meanI) else: print 'not doing anything'
X = pd.read_csv('stroop_data_698_698.csv', header=None) y = np.vstack((np.ones((698, 1)), np.zeros((698, 1)))) rng = np.random.RandomState(4) X_, X_test, y_, y_test = train_test_split(X, y, test_size=0.2, random_state=rng) X_train, X_cros, y_train, y_cros = train_test_split(X_, y_, test_size=0.25, random_state=rng) ica = FastICA(random_state=rng, n_components=50, max_iter=10000) X_ica = ica.fit(X_train).transform(X_train) X_ica /= X_ica.std(axis=0) # Preparing crossvalidation data for each component X_cros_ica = ica.transform(X_cros) X_cros_ica /= X_cros_ica.std(axis=0) components = [] logreg = linear_model.LogisticRegression(C=1e5) for i in range(0, 30): target_component = X_ica[:, i].reshape(X_ica.shape[0], 1) logreg.fit(target_component, y_train.ravel()) X_cros_final = X_cros_ica[:, i].reshape(X_cros_ica.shape[0], 1)
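# The loop above is truncated; presumably each component's held-out accuracy is computed
# next and collected in components. A hedged sketch of that final step inside the loop,
# keeping the names used above (the exact original continuation is not shown):
score = logreg.score(X_cros_final, y_cros.ravel())   # accuracy on the cross-validation split
components.append((i, score))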
def test_fastica_simple(add_noise, global_random_seed, global_dtype): # Test the FastICA algorithm on very simple data. rng = np.random.RandomState(global_random_seed) n_samples = 1000 # Generate two sources: s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1 s2 = stats.t.rvs(1, size=n_samples, random_state=global_random_seed) s = np.c_[s1, s2].T center_and_norm(s) s = s.astype(global_dtype) s1, s2 = s # Mixing angle phi = 0.6 mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi), -np.cos(phi)]]) mixing = mixing.astype(global_dtype) m = np.dot(mixing, s) if add_noise: m += 0.1 * rng.randn(2, 1000) center_and_norm(m) # function as fun arg def g_test(x): return x**3, (3 * x**2).mean(axis=-1) algos = ["parallel", "deflation"] nls = ["logcosh", "exp", "cube", g_test] whitening = ["arbitrary-variance", "unit-variance", False] for algo, nl, whiten in itertools.product(algos, nls, whitening): if whiten: k_, mixing_, s_ = fastica( m.T, fun=nl, whiten=whiten, algorithm=algo, random_state=rng ) with pytest.raises(ValueError): fastica(m.T, fun=np.tanh, whiten=whiten, algorithm=algo) else: pca = PCA(n_components=2, whiten=True, random_state=rng) X = pca.fit_transform(m.T) k_, mixing_, s_ = fastica( X, fun=nl, algorithm=algo, whiten=False, random_state=rng ) with pytest.raises(ValueError): fastica(X, fun=np.tanh, algorithm=algo) s_ = s_.T # Check that the mixing model described in the docstring holds: if whiten: # XXX: exact reconstruction to standard relative tolerance is not # possible. This is probably expected when add_noise is True but we # also need a non-trivial atol in float32 when add_noise is False. # # Note that the 2 sources are non-Gaussian in this test. atol = 1e-5 if global_dtype == np.float32 else 0 assert_allclose(np.dot(np.dot(mixing_, k_), m), s_, atol=atol) center_and_norm(s_) s1_, s2_ = s_ # Check to see if the sources have been estimated # in the wrong order if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)): s2_, s1_ = s_ s1_ *= np.sign(np.dot(s1_, s1)) s2_ *= np.sign(np.dot(s2_, s2)) # Check that we have estimated the original sources if not add_noise: assert_allclose(np.dot(s1_, s1) / n_samples, 1, atol=1e-2) assert_allclose(np.dot(s2_, s2) / n_samples, 1, atol=1e-2) else: assert_allclose(np.dot(s1_, s1) / n_samples, 1, atol=1e-1) assert_allclose(np.dot(s2_, s2) / n_samples, 1, atol=1e-1) # Test FastICA class _, _, sources_fun = fastica( m.T, fun=nl, algorithm=algo, random_state=global_random_seed ) ica = FastICA(fun=nl, algorithm=algo, random_state=global_random_seed) sources = ica.fit_transform(m.T) assert ica.components_.shape == (2, 2) assert sources.shape == (1000, 2) assert_allclose(sources_fun, sources) assert_allclose(sources, ica.transform(m.T)) assert ica.mixing_.shape == (2, 2) ica = FastICA(fun=np.tanh, algorithm=algo) with pytest.raises(ValueError): ica.fit(m.T)
class TICAEmbedding(TimeSeriesEmbedding): """Embed time series using tICA Properties ---------- time_lag : int The number of time steps to lag coordinates before embedding """ def __init__(self, *args, time_lag=10, **kwargs): super().__init__(*args, **kwargs) self.time_lag = time_lag if time_lag > 0: self.model = tICA(n_components=self.n_latent, lag_time=time_lag) elif time_lag == 0: self.model = FastICA(n_components=self.n_latent, random_state=self.random_state) else: raise ValueError( "Time delay parameter must be greater than or equal to zero.") def fit(self, X, y=None, subsample=None): """Fit the model with a time series X Parameters ---------- X : array-like, shape (n_timepoints, n_features) Training data, where n_timepoints is the number of timepoints and n_features is the number of features. y : None Ignored variable. subsample : int or None If set to an integer, a random number of timepoints is selected equal to that integer Returns ------- X_new : array-like, shape (n_timepoints, n_components) Transformed values. """ # Make hankel matrix from dataset Xs = standardize_ts(X) X_train = hankel_matrix(Xs, self.time_window) if subsample: self.train_indices, X_train = resample_dataset( X_train, subsample, random_state=self.random_state) if self.time_lag > 0: self.model.fit([np.reshape(X_train, (X_train.shape[0], -1))]) else: self.model.fit(np.reshape(X_train, (X_train.shape[0], -1))) def transform(self, X, y=None): X_test = hankel_matrix(standardize_ts(X), self.time_window) X_test = np.reshape(X_test, (X_test.shape[0], -1)) if self.time_lag > 0: X_new = self.model.transform([X_test])[0] else: X_new = self.model.transform(X_test) return X_new
start_time = time.time() # Load the data from income_data import X_train, X_test, y_train, y_test # Scale the data scaler = StandardScaler() scaler.fit(X_train) X_train = scaler.transform(X_train) X_test = scaler.transform(X_test) X_toTransform = X_train # Reduce Dimensionality projection = ProjectionAlgorithm(n_components=29) X_transformed = projection.fit_transform(X_train) X_testTransformed = projection.transform(X_test) # Run em clustering with 2 clusters and plot cluster = GaussianMixture(random_state=0, n_components=91).fit(X_transformed) cluster_labels = cluster.predict(X_transformed) X_transformed = np.dot(X_transformed, np.transpose(cluster.means_)) X_testTransformed = np.dot(X_testTransformed, np.transpose(cluster.means_)) # Define the classifier nn = MLPClassifier(solver='lbfgs', random_state=1, alpha=0.005, hidden_layer_sizes=3) grid_params = {'alpha': [0.005], 'hidden_layer_sizes': [3]} clf = GridSearchCV(nn, param_grid=grid_params, cv=3)
class Preprocess: def __init__(self, pca_model=None, all_dat=None): if pca_model is not None: self.pca = joblib.load(pca_model) # try 'eco_full_pca.pkl' self.full_tab = pd.read_json("../data.json") self.full_tab["rem_nrg"] = self.full_tab.apply(lambda x: self.remaining_energy(x.score), axis=1) if all_dat is not None: self.all_dat = joblib.load(all_dat) # try 'all_games.pkl' drop = np.any(self.all_dat, axis=1) self.all_dat = self.all_dat[drop] self.full_tab = pd.read_json("../data.json")[drop] self.full_tab["rem_nrg"] = self.full_tab.apply(lambda x: self.remaining_energy(x.score), axis=1) self.proj = None # print os.system('pwd') @staticmethod def remaining_energy(consumption): max_batt = 0.55 # consumption = np.linspace(0,2000000) # print consumption if consumption == -1: return 0 else: return 100-(consumption/36000/max_batt) def totuple(self, a): try: return tuple(self.totuple(i) for i in a) except TypeError: return a def full_vec(self, pos, sig, size): series=np.zeros((size,), dtype=np.int) try: for i,x in enumerate(pos[:-1]): series[x:pos[i+1]] = sig[i] except Exception: pass #print series return series def get_json(self, file): with open(file) as json_data: data = json.load(json_data) self.dat=pd.DataFrame.from_dict(data['alluser_control']) self.dat["series"] = self.dat.apply(lambda x: self.totuple(self.full_vec(x['x'], x['sig'], 18160)), axis=1, raw=True) self.all_dat=np.empty((2391,18160)) for i,x in enumerate(self.dat.x): self.all_dat[i,:]=self.full_vec(x, self.dat.sig[i], 18160) joblib.dump(self.all_dat, '../all_games.pkl') def train_pca(self, ndim=30): # uses complete data-set # self.pca = TruncatedSVD(n_components=ndim) self.pca = FastICA(n_components=ndim) self.pca.fit(self.all_dat) joblib.dump(self.pca, '../eco_full_pca.pkl') # save for later importing def ready_player_one(self, place): # place must be less than 7. top6 = [78, 122, 166, 70, 67, 69] #best players m1, m2, m3, m4, m5, m6 = [self.full_tab.userid.values == i for i in top6] masks = [m1, m2, m3, m4, m5, m6] X = self.all_dat[masks[place-1]] y = self.full_tab["rem_nrg"].values[masks[place-1]] X_pca = self.pca.transform(X) X_pca = np.vstack((X_pca.T, self.full_tab["finaldrive"].values[masks[place-1]])).T return (X_pca, y) def ready_bad_player(self): # mask = [self.full_tab.userid.values == 1] # gets mediocre score (~12 plays mask = [self.full_tab.userid.values == 79] # gets zero score (~12 plays) X = self.all_dat[mask] y = self.full_tab["rem_nrg"].values[mask] X_pca = self.pca.transform(X) X_pca = np.vstack((X_pca.T, self.full_tab["finaldrive"].values[mask])).T return (X_pca, y) def prep_by_id(self, play_no): id_no = self.full_tab['userid'][self.full_tab['id'] == play_no].values[0] # print id_no mask_a = self.full_tab.userid.values == id_no mask_b = self.full_tab.id.values <= play_no mask = np.logical_and(mask_a, mask_b) X = self.all_dat[mask] y = self.full_tab["rem_nrg"].values[mask] X_pca = self.pca.transform(X) X_pca = np.vstack((X_pca.T, self.full_tab["finaldrive"].values[mask])).T return (X_pca, y)
s1 = np.sin(_x)  # voice of speaker 1
s2 = _x % (np.pi) * k1 * k2 + (np.pi - _x % (np.pi)) * k1 * k3  # voice of speaker 2
x1 = 0.4 * s1 + 0.5 * s2  # recording 1
x2 = 1.2 * s1 - 0.3 * s2  # recording 2
plt.subplot(121)
plt.plot(_x, s1, label='s1')
plt.plot(_x, s2, label='s2')
plt.legend()
plt.subplot(122)
plt.plot(_x, x1, label='x1')
plt.plot(_x, x2, label='x2')
plt.legend()
plt.show()
# Separate independent sources such as s_1 and s_2 from the mixed signals x_1 and x_2
X = np.stack((x1, x2), axis=1)  # stack the two signals into one matrix
fica = FastICA(n_components=2)  # instantiate the FastICA class
fica.fit(X)
X_ica = fica.transform(X)  # independent component analysis result
print(X_ica.shape)  # (1000, 2)
plt.plot(_x, X_ica[:, 0], label='independent component 1')
plt.plot(_x, X_ica[:, 1], label='independent component 2')
plt.legend()
plt.show()
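# FastICA recovers sources only up to permutation, sign, and scale. A quick check of the
# separation above is the absolute correlation between each recovered component and each
# true source (reusing s1, s2, and X_ica from this snippet):
import numpy as np

for i in range(2):
    r1 = abs(np.corrcoef(X_ica[:, i], s1)[0, 1])   # match against speaker 1
    r2 = abs(np.corrcoef(X_ica[:, i], s2)[0, 1])   # match against speaker 2
    print('component %d: |corr| with s1 = %.3f, with s2 = %.3f' % (i, r1, r2))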
def main(mode): path = '/local/attale00/AFLW_ALL/' path_ea = '/local/attale00/AFLW_cropped/cropped3/' # fileNames = utils.getAllFiles(path_ea); # minr = 10000; # for f in fileNames: # im = cv2.imread(path_ea+f,-1) # if im.shape[0]!=40 or im.shape[1]!=120: # print f # print im.shape # minr = minr if im.shape[0]>= minr else im.shape[0] # # print minr # labs=utils.parseLabelFiles(path+'/labels/labels','mouth_opening',fileNames,cutoffSeq='.png',suffix='_face0.labels') testSet = fg.dataContainer(labs) roi=(0,37,0,115) roi=None filters = None meanI = None #roi=(44,84,88,168) # eM=np.load('/home/attale00/Desktop/mouthMask.npy') # m=cv2.resize(np.uint8(eM),(256,256)); # strel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(3,3)) # dil = cv2.dilate(m,strel) # # m=dil>0; components = 150 X=fg.getAllImagesFlat(path_ea,testSet.fileNames,(40,120),roi=roi) # X=fg.getAllImagesFlat(path_ea,testSet.fileNames,(120,40),roi=roi,resizeFactor = .5) # # perform ICA if mode not in ['s','v']: ica = FastICA(n_components=components,whiten=True) ica.fit(X) meanI=np.mean(X,axis=0) X1=X-meanI data=ica.transform(X1) filters=ica.components_ elif mode in ['s','v']: W=np.load('/home/attale00/Desktop/classifiers/patches/filterMP1.npy') m=np.load('/home/attale00/Desktop/classifiers/patches/meanIMP1.npy') X1=X-m data=np.dot(X1,W.T) for i in range(len(fileNames)): testSet.data[i].extend(data[i,:]) #orientations = 2 #strel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,(3,3)) #fg.getHogFeature(testSet,roi,path=path_ea,ending='.png',extraMask = None,orientations = 5, cells_per_block=(3,3),pixels_per_cell=(24,8),maskFromAlpha=False) #fg.getColorHistogram(testSet,(0,40,40,80),path=path_ea,ending='.png',colorspace='lab',bins=bins) #fg.getImagePatchStat(testSet,path=path_ea,patchSize=(4,12)) #fg.getImagePatchStat(testSet,path='/local/attale00/AFLW_cropped/mouth_img_error/',patchSize=(4,12)) #pca # n_samples, n_features = X.shape # # mean_ = np.mean(X, axis=0) # X -= mean_ # U, S, V = linalg.svd(X) # explained_variance_ = (S ** 2) / n_samples # explained_variance_ratio_ = (explained_variance_ /explained_variance_.sum()) # K=V / S[:, np.newaxis] * np.sqrt(n_samples) # filters=K[:100] # data=np.dot(X,filters.T) testSet.targetNum=map(utils.mapMouthLabels2Two,testSet.target) rf=classifierUtils.standardRF(max_features = 23,min_split=15,max_depth=70,n_estimators=150) #rf=classifierUtils.standardRF(max_features = 5,min_split=12,max_depth=45) #rf = svm.NuSVC() #rf = linear_model.SGDClassifier(loss='perceptron', eta0=1, learning_rate='constant', penalty=None) if mode in ['s','v']: print 'Classifying with loaded classifier' _classifyWithOld(path,testSet,mode) elif mode in ['c']: print 'cross validation of data' rValues = classifierUtils.dissectedCV(rf,testSet) pickle.dump(rValues,open('patches_cv_ica_{}'.format(components),'w')) elif mode in ['save']: print 'saving new classifier' _saveRF(testSet,rf,filters=filters,meanI=meanI) else: print 'not doing anything'
X_train.shape[0]) t0 = time() # pca # pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train) # ica pca = FastICA(n_components=n_components, whiten=True).fit(X_train) print "done in %0.3fs" % (time() - t0) #print 'First:'+str(pca.explained_variance_ratio_[0]) #print 'Second:'+str(pca.explained_variance_ratio_[1]) eigenfaces = pca.components_.reshape((n_components, h, w)) print "Projecting the input data on the eigenfaces orthonormal basis" t0 = time() X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) print "done in %0.3fs" % (time() - t0) ############################################################################### # Train a SVM classification model print "Fitting the classifier to the training set" t0 = time() param_grid = { 'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } # for sklearn version 0.16 or prior, the class_weight parameter value is 'auto' clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid) clf = clf.fit(X_train_pca, y_train)
def test_fastica_simple(add_noise, seed): # Test the FastICA algorithm on very simple data. rng = np.random.RandomState(seed) # scipy.stats uses the global RNG: n_samples = 1000 # Generate two sources: s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1 s2 = stats.t.rvs(1, size=n_samples) s = np.c_[s1, s2].T center_and_norm(s) s1, s2 = s # Mixing angle phi = 0.6 mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi), -np.cos(phi)]]) m = np.dot(mixing, s) if add_noise: m += 0.1 * rng.randn(2, 1000) center_and_norm(m) # function as fun arg def g_test(x): return x ** 3, (3 * x ** 2).mean(axis=-1) algos = ['parallel', 'deflation'] nls = ['logcosh', 'exp', 'cube', g_test] whitening = [True, False] for algo, nl, whiten in itertools.product(algos, nls, whitening): if whiten: k_, mixing_, s_ = fastica(m.T, fun=nl, algorithm=algo, random_state=rng) assert_raises(ValueError, fastica, m.T, fun=np.tanh, algorithm=algo) else: pca = PCA(n_components=2, whiten=True, random_state=rng) X = pca.fit_transform(m.T) k_, mixing_, s_ = fastica(X, fun=nl, algorithm=algo, whiten=False, random_state=rng) assert_raises(ValueError, fastica, X, fun=np.tanh, algorithm=algo) s_ = s_.T # Check that the mixing model described in the docstring holds: if whiten: assert_almost_equal(s_, np.dot(np.dot(mixing_, k_), m)) center_and_norm(s_) s1_, s2_ = s_ # Check to see if the sources have been estimated # in the wrong order if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)): s2_, s1_ = s_ s1_ *= np.sign(np.dot(s1_, s1)) s2_ *= np.sign(np.dot(s2_, s2)) # Check that we have estimated the original sources if not add_noise: assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=2) assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=2) else: assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=1) assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=1) # Test FastICA class _, _, sources_fun = fastica(m.T, fun=nl, algorithm=algo, random_state=seed) ica = FastICA(fun=nl, algorithm=algo, random_state=seed) sources = ica.fit_transform(m.T) assert_equal(ica.components_.shape, (2, 2)) assert_equal(sources.shape, (1000, 2)) assert_array_almost_equal(sources_fun, sources) assert_array_almost_equal(sources, ica.transform(m.T)) assert_equal(ica.mixing_.shape, (2, 2)) for fn in [np.tanh, "exp(-.5(x^2))"]: ica = FastICA(fun=fn, algorithm=algo) assert_raises(ValueError, ica.fit, m.T) assert_raises(TypeError, FastICA(fun=range(10)).fit, m.T)
def main():
    # First, extract the physiological data eta_data and beta_data -------------------------------------
    # Change the working directory to the path containing the data
    os.chdir(DATAPATH)
    # Prepare a DataFrame to hold the data
    data_df = pd.DataFrame([])
    for i_sub in range(len(FILENAME_LIST)):
        # Store the data in DataFrames (this data is already standardised)
        mean_df = pd.read_excel(FILENAME_LIST[i_sub], sheet_name="mean").drop("Statistics", axis=1)
        max_df = pd.read_excel(FILENAME_LIST[i_sub], sheet_name="max").drop("Statistics", axis=1)
        min_df = pd.read_excel(FILENAME_LIST[i_sub], sheet_name="min").drop("Statistics", axis=1)
        std_df = pd.read_excel(FILENAME_LIST[i_sub], sheet_name="std").drop("Statistics", axis=1)
        # Arrange horizontally in the order [mean, max, min, std]
        df = pd.concat([
            mean_df,
            max_df.drop("Task", axis=1),
            min_df.drop("Task", axis=1),
            std_df.drop("Task", axis=1)
        ], axis=1, sort=False)
        # Concatenate each subject's results vertically (standardising along the way)
        data_df = data_df.append(df)
    # Remove the task number
    data2_df = data_df.drop(["Task"], axis=1)
    data2_df.columns = COLUMNS48
    # Standardise the data column by column (standardising the already standardised data)
    stan_data = scipy.stats.zscore(data2_df, axis=0)
    # Convert to a DataFrame
    stan_data_df = pd.DataFrame(stan_data, columns=COLUMNS48)
    # Extract viscosity (eta) and stiffness (beta) separately
    eta_data_df = stan_data_df.iloc[:, [0, 6, 12, 18, 24, 30, 36, 42]]
    beta_data_df = stan_data_df.iloc[:, [1, 7, 13, 19, 25, 31, 37, 43]]
    eta_data = eta_data_df.values
    beta_data = beta_data_df.values
    # -------------------------------------------------------------------------
    # Next, extract the subjective-evaluation data q_stan_data ------------------------------------
    os.chdir(DATAPATH2)  # change the working directory to the path containing the data
    # Prepare a DataFrame to hold the data
    q_data_df = pd.DataFrame([])
    for i_sub in range(len(FILENAME_LIST2)):
        # Store the data in a DataFrame
        q_df = pd.read_excel(FILENAME_LIST2[i_sub])
        # Concatenate each subject's results vertically (standardising along the way)
        q_data_df = q_data_df.append(arrange_data(q_df, i_sub))
    # Remove the task number and stimulation type
    q_data2_df = q_data_df.drop(["No", "Stimulation"], axis=1)
    # Standardise the data column by column (standardising the already standardised data)
    q_stan_data = scipy.stats.zscore(q_data2_df, axis=0)
    # -------------------------------------------------------------------------
    # Prepare an ndarray with the stimulation types
    odor = q_data_df["Stimulation"].values.tolist()
    odor = np.reshape(odor, (len(odor), 1))  # stimulation type
    # Run PCA (the return values are ndarrays)
    un_score, non_score = mypca2(q_stan_data, eta_data, beta_data, odor)
    # Run ICA
    # Prepare the data
    ica_data = np.vstack((un_score[:, [1, 2]], non_score[:, [1, 2]]))
    # Independent component analysis with FastICA
    ica = FastICA()
    ica.fit(ica_data)
    Uica = ica.components_.T
    Aica = ica.transform(ica_data).T
    Uica = Uica / np.sqrt((Uica**2).sum(axis=0))
    un = Aica[:, 0:len(un_score)]
    non = Aica[:, len(un_score):]
    # Draw the scatter plot
    plt.figure(figsize=(5, 5))
    plt.scatter(un[0], un[1], s=80, c=[0.4, 0.6, 0.9], alpha=0.8, linewidths="1", edgecolors=[0, 0, 0])
    plt.scatter(non[0], non[1], s=80, c=[0.5, 0.5, 0.5], alpha=0.8, linewidths="1", edgecolors=[0, 0, 0])
    plt.title("ICA scatter", fontsize=18)
    # Show the plot
    plt.tight_layout()  # keep titles from overlapping
    plt.show()
    # Correlation analysis between PC1 of the subjective ratings and the ICA result of the physiological data
    # Set the figure size
    plt.figure(figsize=(10, 5))
    # First plot
    plt.subplot(1, 2, 1)
    plt.scatter(un_score[:, 0], un[0], s=80, c=[0.4, 0.6, 0.9], alpha=0.8, linewidths="1", edgecolors=[0, 0, 0])
    plt.scatter(non_score[:, 0], non[0], s=80, c=[0.5, 0.5, 0.5], alpha=0.8, linewidths="1", edgecolors=[0, 0, 0])
    plt.title("scatter", fontsize=18)
    plt.xlabel("Emotion_PC1", fontsize=18)
    plt.ylabel("ICA_1", fontsize=18)
    correlation_analysis(np.hstack((un_score[:, 0], non_score[:, 0])), np.hstack((un[0], non[0])))
    # Second plot
    plt.subplot(1, 2, 2)
    plt.scatter(un_score[:, 0], un[1], s=80, c=[0.4, 0.6, 0.9], alpha=0.8, linewidths="1", edgecolors=[0, 0, 0])
    plt.scatter(non_score[:, 0], non[1], s=80, c=[0.5, 0.5, 0.5], alpha=0.8, linewidths="1", edgecolors=[0, 0, 0])
    plt.title("scatter", fontsize=18)
    plt.xlabel("Emotion_PC1", fontsize=18)
    plt.ylabel("ICA_2", fontsize=18)
    correlation_analysis(np.hstack((un_score[:, 0], non_score[:, 0])), np.hstack((un[1], non[1])))
    # Show the plot
    plt.tight_layout()  # keep titles from overlapping
    plt.show()
"""