def test_initialization():
    rng = np.random.RandomState(0)
    U_init = rng.randn(5, 3)
    V_init = rng.randn(3, 4)
    model = SparsePCA(n_components=3, U_init=U_init, V_init=V_init,
                      max_iter=0, random_state=rng)
    model.fit(rng.randn(5, 4))
    assert_array_equal(model.components_, V_init)
def test_fit_transform_tall():
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 65, (8, 8), random_state=rng)  # tall array
    spca_lars = SparsePCA(n_components=3, method="lars", random_state=rng)
    U1 = spca_lars.fit_transform(Y)
    spca_lasso = SparsePCA(n_components=3, method="cd", random_state=rng)
    U2 = spca_lasso.fit(Y).transform(Y)
    assert_array_almost_equal(U1, U2)
def test_transform_nan():
    # Test that SparsePCA won't return NaN when a feature is zero in all
    # samples.
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    Y[:, 0] = 0
    estimator = SparsePCA(n_components=8)
    assert_false(np.any(np.isnan(estimator.fit_transform(Y))))
def sparse_pca(self):
    """
    Runs sparse PCA on the view and returns the projected view and the
    principal components.
    """
    model = SparsePCA(n_components=param['components'],
                      alpha=param['sparse_pca_alpha'])
    model.fit(self.view)
    return model.transform(self.view), model.components_
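# A minimal standalone sketch of the same fit/transform/components_ pattern
# used above, on synthetic data. The data shape and parameter values are
# illustrative assumptions only (the `param` dict and `self.view` from the
# snippet above are not defined here).
import numpy as np
from sklearn.decomposition import SparsePCA

view = np.random.RandomState(0).randn(100, 20)   # stand-in for self.view
model = SparsePCA(n_components=5, alpha=1.0, random_state=0)

projected = model.fit_transform(view)    # shape (100, 5)
components = model.components_           # shape (5, 20), many entries exactly 0
print(projected.shape, components.shape, np.mean(components == 0))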
def sccodedirect():
    """Get the sparse PCA (RPCA) result for the no-glasses model."""
    nglassmodel = np.load('nglassline.npy').astype('f')
    from sklearn.decomposition import SparsePCA
    learning = SparsePCA(500, verbose=True)
    learning.fit(nglassmodel)
    import pickle
    with open('sparsepcadirect', 'wb') as f:
        pickle.dump(learning, f, -1)
def test_scaling_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=rng, normalize_components=True)
    results_train = spca_lars.fit_transform(Y)
    results_test = spca_lars.transform(Y[:10])
    assert_allclose(results_train[0], results_test[0])
def do_sparse_pca(sparse_matrix):
    # from scikit-learn:
    # http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.SparsePCA.html#sklearn.decomposition.SparsePCA
    dense_matrix = sparse_matrix.tobsr().toarray()
    # instantiate the SparsePCA estimator with some parameters
    spca = SparsePCA(n_components=6, alpha=0.01, ridge_alpha=0.01,
                     max_iter=1000, tol=1e-08, method='lars', n_jobs=1,
                     U_init=None, V_init=None, verbose=False,
                     random_state=None)
    # fit the SparsePCA model on our matrix
    spca.fit(dense_matrix)
    # return the components
    return spca.components_
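# A small usage sketch for do_sparse_pca above, with a random scipy sparse
# matrix standing in for the real input; the size and density are assumptions
# made only for illustration.
import numpy as np
from scipy import sparse

rng = np.random.RandomState(0)
mat = sparse.random(50, 20, density=0.2, random_state=rng, format='csr')

components = do_sparse_pca(mat)
print(components.shape)  # (6, 20), with many loadings exactly zero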
def test_initialization(norm_comp):
    rng = np.random.RandomState(0)
    U_init = rng.randn(5, 3)
    V_init = rng.randn(3, 4)
    model = SparsePCA(n_components=3, U_init=U_init, V_init=V_init,
                      max_iter=0, random_state=rng,
                      normalize_components=norm_comp)
    model.fit(rng.randn(5, 4))
    if norm_comp:
        assert_allclose(model.components_,
                        V_init / np.linalg.norm(V_init, axis=1)[:, None])
    else:
        assert_allclose(model.components_, V_init)
def test_correct_shapes():
    rng = np.random.RandomState(0)
    X = rng.randn(12, 10)
    spca = SparsePCA(n_components=8, random_state=rng)
    U = spca.fit_transform(X)
    assert_equal(spca.components_.shape, (8, 10))
    assert_equal(U.shape, (12, 8))
    # test overcomplete decomposition
    spca = SparsePCA(n_components=13, random_state=rng)
    U = spca.fit_transform(X)
    assert_equal(spca.components_.shape, (13, 10))
    assert_equal(U.shape, (12, 13))
def __init__(self, num_components=10, catalog_name='unknown', alpha=0.1,
             ridge_alpha=0.01, max_iter=2000, tol=1e-9, n_jobs=1,
             random_state=None):
    self._decomposition = 'Sparse PCA'
    self._num_components = num_components
    self._catalog_name = catalog_name
    self._alpha = alpha
    self._ridge_alpha = ridge_alpha
    self._n_jobs = n_jobs
    self._max_iter = max_iter
    self._tol = tol
    self._random_state = random_state

    self._SPCA = SparsePCA(n_components=self._num_components,
                           alpha=self._alpha,
                           ridge_alpha=self._ridge_alpha,
                           n_jobs=self._n_jobs,
                           max_iter=self._max_iter,
                           tol=self._tol,
                           random_state=self._random_state)
def test_pca_vs_spca():
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
    Z, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)
    spca = SparsePCA(alpha=0, ridge_alpha=0, n_components=2,
                     normalize_components=True)
    pca = PCA(n_components=2)
    pca.fit(Y)
    spca.fit(Y)
    results_test_pca = pca.transform(Z)
    results_test_spca = spca.transform(Z)
    assert_allclose(np.abs(spca.components_.dot(pca.components_.T)),
                    np.eye(2), atol=1e-5)
    results_test_pca *= np.sign(results_test_pca[0, :])
    results_test_spca *= np.sign(results_test_spca[0, :])
    assert_allclose(results_test_pca, results_test_spca)
def spca(data, num_components=None, alpha=1):
    # creates a matrix with sparse principal component analysis
    # build matrix with all data
    data = [d.flatten() for d in data if not any(isnan(d))]
    datamatrix = row_stack(data)
    # center data
    cdata = datamatrix - mean(datamatrix, axis=0)
    if num_components is None:
        num_components = cdata.shape[0]
    # do spca on matrix
    spca = SparsePCA(n_components=num_components, alpha=alpha)
    spca.fit(cdata)
    # normalize components
    components = spca.components_.T
    for r in range(0, components.shape[1]):
        compnorm = numpy.apply_along_axis(numpy.linalg.norm, 0,
                                          components[:, r])
        if not compnorm == 0:
            components[:, r] /= compnorm
    components = components.T
    # calc adjusted explained variance from "Sparse Principal Component
    # Analysis" by Zou, Hastie, Tibshirani
    spca.components_ = components
    # nuz = spca.transform(cdata).T
    # the ridge solver was renamed from 'dense_cholesky' to 'cholesky' in
    # newer scikit-learn releases
    nuz = ridge_regression(spca.components_.T, cdata.T, 0.01,
                           solver='cholesky').T
    # nuz = dot(components, cdata.T)
    q, r = qr(nuz.T)
    cumulative_var = []
    for i in range(1, num_components + 1):
        cumulative_var.append(trace(r[0:i, ] * r[0:i, ]))
    explained_var = [math.sqrt(cumulative_var[0])]
    for i in range(1, num_components):
        explained_var.append(math.sqrt(cumulative_var[i]) -
                             math.sqrt(cumulative_var[i - 1]))
    order = numpy.argsort(explained_var)[::-1]
    components = numpy.take(components, order, axis=0)
    evars = numpy.take(explained_var, order).tolist()
    # evars = numpy.take(explained_var, order)
    # order2 = [0, 1, 2, 4, 5, 7, 12, 19]
    # components = numpy.take(components, order2, axis=0)
    # evars = numpy.take(evars, order2).tolist()
    return components, evars
def test_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    spca = SparsePCA(n_components=3, n_jobs=2, method='lars', alpha=alpha,
                     random_state=0).fit(Y)
    U2 = spca.transform(Y)
    assert_true(not np.all(spca_lars.components_ == 0))
    assert_array_almost_equal(U1, U2)
    # Test that CD gives similar results
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=0,
                           alpha=alpha)
    spca_lasso.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
def test_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)
    # Test that CD gives similar results
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=0,
                           alpha=alpha)
    spca_lasso.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
def test_fit_transform_tall(norm_comp):
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 65, (8, 8), random_state=rng)  # tall array
    spca_lars = SparsePCA(n_components=3, method='lars', random_state=rng,
                          normalize_components=norm_comp)
    U1 = spca_lars.fit_transform(Y)
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=rng,
                           normalize_components=norm_comp)
    U2 = spca_lasso.fit(Y).transform(Y)
    assert_array_almost_equal(U1, U2)
def process_dim_reduction(method='pca', n_dim=10):
    """
    Default linear dimensionality reduction method. For each method, return a
    BaseEstimator instance corresponding to the method given as input.

    Parameters
    ----------
    method: str, default 'pca'
        Method used for dimensionality reduction.
        Implemented: 'pca', 'ica', 'fa' (Factor Analysis),
        'nmf' (Non-negative Matrix Factorisation),
        'sparsepca' (Sparse PCA), 'pls' (Partial Least Squares).

    n_dim: int, default 10
        Number of domain-specific factors to compute.

    Returns
    -------
    Estimator, i.e. BaseEstimator instance
    """
    if method.lower() == 'pca':
        clf = PCA(n_components=n_dim)
    elif method.lower() == 'ica':
        print('ICA')
        clf = FastICA(n_components=n_dim)
    elif method.lower() == 'fa':
        clf = FactorAnalysis(n_components=n_dim)
    elif method.lower() == 'nmf':
        clf = NMF(n_components=n_dim)
    elif method.lower() == 'sparsepca':
        clf = SparsePCA(n_components=n_dim, alpha=10., tol=1e-4,
                        verbose=10, n_jobs=1)
    elif method.lower() == 'pls':
        clf = PLS(n_components=n_dim)
    else:
        raise NameError('%s is not an implemented method' % (method))

    return clf
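# A small usage sketch for the selector above, assuming the estimators it
# references (PCA, FastICA, SparsePCA, ...) are imported in the same module;
# the data and the choice of 'sparsepca' with n_dim=5 are illustrative only.
import numpy as np

X = np.random.RandomState(0).rand(50, 30)

reducer = process_dim_reduction(method='sparsepca', n_dim=5)
X_low = reducer.fit_transform(X)
print(X_low.shape)  # (50, 5)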
def new(stop_words=[], decomposition='SVD', n_components=5):
    # Prepare vectoriser engines
    idf = TfidfVectorizer(
        ngram_range=(1, 3),  # Unigram, bigram, & trigram
        stop_words=stop_words
    )

    # Prepare normaliser
    norm = Normalizer(norm='max')

    print(colored('Texthasher model created', 'yellow'))

    # Prepare dimensionality reduction
    if decomposition and n_components:
        if decomposition == 'LDA':
            # Results in a non-negative matrix
            reducer = LatentDirichletAllocation(  # TFIDF --> Topic term
                n_topics=n_components,
                max_doc_update_iter=20,
                max_iter=8
            )
            return [idf, norm, reducer]
        elif decomposition == 'SVD':
            reducer = TruncatedSVD(  # Best for small dataset,
                n_components,        # nightmare for large dataset
                n_iter=8)            # Damn slow
            return [idf, norm, reducer]
        elif decomposition == 'PCA':
            # When using IPCA, remember to always keep:
            # n_samples > n_components > batch_size
            # reducer = IncrementalPCA(n_components)
            # Sparse -> Dense greedily consumes a large amount of memory
            # to_dense = SparseToDense()
            # return [idf, norm, to_dense, reducer]
            reducer = SparsePCA(n_components)
            return [idf, norm, reducer]
        return [idf, norm]
    else:
        return [idf, norm]
def test_fit_transform_parallel():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    spca = SparsePCA(n_components=3, n_jobs=2, method='lars', alpha=alpha,
                     random_state=0).fit(Y)
    U2 = spca.transform(Y)
    assert not np.all(spca_lars.components_ == 0)
    assert_array_almost_equal(U1, U2)
def reduce_dimension(name, x, n_components):
    algorithms = {
        'factor_analysis': FactorAnalysis(random_state=0,
                                          n_components=n_components),
        'fast_ica': FastICA(random_state=0, n_components=n_components),
        'nmf': Pipeline([('min_max', MinMaxScaler()),
                         ('nmf', NMF(random_state=0,
                                     n_components=n_components))]),
        'pca': PCA(random_state=0, n_components=n_components),
        'sparse_pca': SparsePCA(random_state=0, n_components=n_components),
        'truncated_svd': TruncatedSVD(random_state=0,
                                      n_components=n_components)
    }
    return Pipeline([(name, algorithms.get(name)),
                     ('min_max', MinMaxScaler())]).fit_transform(x)
def test_fit_transform(norm_comp):
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0, normalize_components=norm_comp)
    spca_lars.fit(Y)
    # Test that CD gives similar results
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=0,
                           alpha=alpha, normalize_components=norm_comp)
    spca_lasso.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
    # Test that deprecated ridge_alpha parameter throws warning
    warning_msg = "The ridge_alpha parameter on transform()"
    assert_warns_message(DeprecationWarning, warning_msg, spca_lars.transform,
                         Y, ridge_alpha=0.01)
    assert_warns_message(DeprecationWarning, warning_msg, spca_lars.transform,
                         Y, ridge_alpha=None)
def main():
    parser = argparse.ArgumentParser(
        description='py, Dirout, EUDT_txt, num_case')
    parser.add_argument('--Dirout', '-i1', default='F:/SPCA_debug/result',
                        help='Dirout_path')
    parser.add_argument('--EUDT_text', '-i2',
                        default='F:/SPCA_debug/input.txt',
                        help='EUDT(training_data_list)(.txt)')
    parser.add_argument('--num_case', '-i3', default='50',
                        help='num of training data(int)', type=int)
    args = parser.parse_args()

    case_size = int(512 * 512 * 1)

    # load data
    print('load data')
    case = np.zeros((args.num_case, case_size))
    with open(args.EUDT_text, 'rt') as f:
        i = 0
        for line in f:
            if i >= args.num_case:
                break
            line = line.split()
            case[i, :] = IO.read_raw(line[0], dtype='double')
            i += 1
    print(case.shape)

    # Prepare for pca
    print('process pca')
    spca = SparsePCA(n_components=args.num_case - 1)

    # Do pca and map to principal components
    spca.fit(case)

    # # mean_vector
    # mean_vector = pca.mean_

    # components
    U = spca.components_
    for i in range(0, args.num_case - 1):
        IO.write_raw(U[i, :].copy(),
                     args.Dirout + '/vect_' + str(i).zfill(4) + '.vect')  # PCs
def make_methods_plot(labeled=True):
    file_out = "methods"
    n_components = 2
    n_neighbors = 20
    methods = {
        "LLE": LocallyLinearEmbedding(n_neighbors=n_neighbors),
        # "Spectral NN": SpectralEmbedding(affinity="nearest_neighbors"),
        # "Spectral RBF": SpectralEmbedding(affinity="rbf"),
        "PCA": PCA(n_components=n_components),
        "IncrementalPCA": IncrementalPCA(n_components=n_components),
        "KernelPCA": KernelPCA(n_components=n_components),
        "SparsePCA": SparsePCA(n_components=n_components),
        "TruncatedSVD": TruncatedSVD(n_components=n_components),
        # f"TSNE(perplexity = {n_neighbors})": TSNE(perplexity=n_neighbors),
    }
    if labeled:
        make_plot_labeled(methods, file_out=f"{file_out}_labeled")
    else:
        make_plot_unlabeled(methods, file_out=f"{file_out}_unlabeled")
def test_fit_transform_parallel(norm_comp):
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0, normalize_components=norm_comp)
    spca_lars.fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    spca = SparsePCA(n_components=3, n_jobs=2, method='lars', alpha=alpha,
                     random_state=0, normalize_components=norm_comp).fit(Y)
    U2 = spca.transform(Y)
    assert_true(not np.all(spca_lars.components_ == 0))
    assert_array_almost_equal(U1, U2)
def dim_reduction_method(self):
    """
    select dimensionality reduction method
    """
    if self.dim_reduction == 'pca':
        return PCA()
    elif self.dim_reduction == 'factor-analysis':
        return FactorAnalysis()
    elif self.dim_reduction == 'fast-ica':
        return FastICA()
    elif self.dim_reduction == 'kernel-pca':
        return KernelPCA()
    elif self.dim_reduction == 'sparse-pca':
        return SparsePCA()
    elif self.dim_reduction == 'truncated-svd':
        return TruncatedSVD()
    elif self.dim_reduction is not None:
        raise ValueError(
            '%s is not a supported dimensionality reduction method. '
            'Valid inputs are: "pca", "factor-analysis", "fast-ica", '
            '"kernel-pca", "sparse-pca", "truncated-svd".'
            % (self.dim_reduction))
def get_dim_reds_scikit(pct_features):
    n_components = max(int(pct_features * num_features), 1)
    return [
        LinearDiscriminantAnalysis(n_components=n_components),
        TruncatedSVD(n_components=n_components),
        # SparseCoder(n_components=n_components),
        DictionaryLearning(n_components=n_components),
        FactorAnalysis(n_components=n_components),
        SparsePCA(n_components=n_components),
        NMF(n_components=n_components),
        PCA(n_components=n_components),
        RandomizedPCA(n_components=n_components),
        KernelPCA(kernel="linear", n_components=n_components),
        KernelPCA(kernel="poly", n_components=n_components),
        KernelPCA(kernel="rbf", n_components=n_components),
        KernelPCA(kernel="sigmoid", n_components=n_components),
        KernelPCA(kernel="cosine", n_components=n_components),
        Isomap(n_components=n_components),
        LocallyLinearEmbedding(n_components=n_components,
                               eigen_solver='auto', method='standard'),
        LocallyLinearEmbedding(n_neighbors=n_components,
                               n_components=n_components,
                               eigen_solver='auto', method='modified'),
        LocallyLinearEmbedding(n_neighbors=n_components,
                               n_components=n_components,
                               eigen_solver='auto', method='ltsa'),
        SpectralEmbedding(n_components=n_components)
    ]
def test_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)
    # Test that CD gives similar results
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=0,
                           alpha=alpha)
    spca_lasso.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
    # Test that deprecated ridge_alpha parameter throws warning
    warning_msg = "The ridge_alpha parameter on transform()"
    assert_warns_message(DeprecationWarning, warning_msg, spca_lars.transform,
                         Y, ridge_alpha=0.01)
    assert_warns_message(DeprecationWarning, warning_msg, spca_lars.transform,
                         Y, ridge_alpha=None)
def WeightsEstimatedFromSparsePCA(ret_port, n_com=25):
    tf = SparsePCA(n_components=n_com)  # , random_state=0)
    tf.fit(ret_port.agg(lambda x: x - x.mean()).fillna(0.0))  # mind the scale of the inputs
    tf.transform(
        ret_port.fillna(0.0)
    )  # .apply(lambda x: x.where(~x.isnull(), x.mean()), axis=0))  # , index=date_investing[date_investing < '2019-12'])
    # Flip the sign of each combined portfolio according to its mean return
    weights = pd.DataFrame(tf.components_, columns=signal_names.split(',')).T
    ret_transformed_port = (cov_chara_ret.fillna(0.0) @ weights).replace(
        0.0, np.nan)
    for c in weights.columns:
        weights[c] = weights[c] * np.sign(
            ret_transformed_port[c].mean()) / np.abs(weights[c]).sum()
    ret_transformed_port = (cov_chara_ret.fillna(0.0) @ weights).replace(
        0.0, np.nan)
    # Select portfolios either by t-value or by Sharpe ratio (SR)
    select_port = np.abs(
        PortfolioAnalysis(ret_transformed_port.dropna(
            how='all', axis=1))).T.sort_values(
                by='SR', ascending=False).index[:int(n_com * 0.67)]
    for p in select_port:
        weights[p] *= np.sign(ret_transformed_port[p].mean())
    return weights[select_port]
def WeightsEstimatedFromSparsePCAWithWeightedCovariance(ret_p, n_com=30):
    ret_port = ret_p.dropna(how='all', axis=1)
    tf = SparsePCA(n_components=n_com)  # , random_state=0)
    cov_matrix = WeightedCovariance(ret_port)
    tf.fit(cov_matrix)  # mind the scale of the inputs
    tf.transform(
        ret_port.fillna(0.0)
    )  # .apply(lambda x: x.where(~x.isnull(), x.mean()), axis=0))  # , index=date_investing[date_investing < '2019-12'])
    # Flip the sign of each combined portfolio according to its mean return
    weights = pd.DataFrame(tf.components_, columns=cov_matrix.columns).T
    ret_transformed_port = (ret_port.fillna(0.0) @ weights).replace(
        0.0, np.nan)
    for c in ret_transformed_port.columns:
        weights[c] = weights[c] * np.sign(
            ret_transformed_port[c].mean()) / np.abs(weights[c]).sum()
    ret_transformed_port = (ret_port.fillna(0.0) @ weights).replace(
        0.0, np.nan)
    # Select portfolios either by t-value or by Sharpe ratio (SR)
    select_port = np.abs(
        PortfolioAnalysis(ret_transformed_port)).T.sort_values(
            by='SR', ascending=False).index
    for p in select_port:
        weights[p] *= np.sign(ret_transformed_port[p].mean())
    return weights[select_port]
        matrix_KPCA, clusters)
    # print ("get KPCA BF matrix")
    # print (matrix_KPCA_BF)
    # print ("get KPCA mean matrix")
    # print (matrix_KPCA_mean)
    matrix_KPCA_BF.to_csv(gl.get_value("outputFile") + "_KPCA_BF.txt",
                          sep='\t', header=True, index=True)
    matrix_KPCA_mean.to_csv(gl.get_value("outputFile") + "_KPCA_mean.txt",
                            sep='\t', header=True, index=True)

if gl.get_value("SPCA_Flag"):
    spca = SparsePCA(n_components=gl.get_value("SPCA_n_components"))
    spca.fit(wholeData)
    expre_SPCA = spca.transform(expre_data)
    # print ("get SPCA data")
    matrix_SPCA = Methods.get_matrix_dist(
        data=expre_SPCA,
        lab=lab,
        clusters=clusters,
        average_number=gl.get_value("SPCA_AvgNum"),
        caculation_number=gl.get_value("SPCA_CalNum"))
    # print ("get SPCA matrix")
    matrix_SPCA_BF = Methods.disMatrix_to_bfMatrix(matrix_SPCA, clusters)
    matrix_SPCA_mean = Methods.disMatrix_to_meanMatrix(matrix_SPCA, clusters)
    # print ("get SPCA BF matrix")
    # print (matrix_SPCA_BF)
        plt.scatter(X[y == label, 0], X[y == label, 1],
                    color=color, label=class_name)
    plt.title(title)
    plt.legend(loc='best')

# Visualisation before the transforms; only the first two dimensions are shown
plt.figure(1)
plot_func('origin data')

# KernelPCA is a non-linear reduction; LDA can only be used for supervised
# (class-aware) reduction.
# ICA is usually used to separate superimposed signals rather than to reduce
# dimensionality.
models_list = [('LDA', LinearDiscriminantAnalysis(n_components=2)),
               ('PCA', PCA(n_components=2, random_state=0)),
               ('PCARand', PCA(n_components=2, random_state=0,
                               svd_solver='randomized')),
               ('IncrementalPCA', IncrementalPCA(n_components=2,
                                                 batch_size=10, whiten=True)),
               ('FactorAnalysis', FactorAnalysis(n_components=2,
                                                 max_iter=500)),
               ('FastICA', FastICA(n_components=2, random_state=0)),
               ('KernelPCA', KernelPCA(n_components=2, random_state=0,
                                       kernel='rbf')),
               ('SparsePCA', SparsePCA(n_components=2, random_state=0,
                                       verbose=True)),
               ('MiniBatchSparsePCA', MiniBatchSparsePCA(n_components=2,
                                                         verbose=True,
                                                         batch_size=10,
                                                         random_state=0)),
               ('DictionaryLearning', DictionaryLearning(n_components=2,
                                                         verbose=True,
                                                         random_state=0)),
               ('MiniBatchDictionaryLearning',
                MiniBatchDictionaryLearning(n_components=2, batch_size=5,
                                            random_state=0, alpha=0.1))]

model = namedtuple('models', ['mod_name', 'mod_ins'])
for i in range(len(models_list)):
    mod = model(*models_list[i])
    if mod.mod_name == 'LDA':
        mod.mod_ins.fit(X, y)
        X_new = mod.mod_ins.transform(X)
    else:
        X_new = mod.mod_ins.fit_transform(X)
    plt.figure(i + 2)
    plot_func(mod.mod_name + ' transformed data')
                                           index=validation_index)

scatterPlot(X_train_incrementalPCA, y_train, "Incremental PCA")

# In[ ]:

# Sparse PCA
from sklearn.decomposition import SparsePCA

n_components = 100
alpha = 0.0001
random_state = 2020
n_jobs = -1

sparsePCA = SparsePCA(n_components=n_components, alpha=alpha,
                      random_state=random_state, n_jobs=n_jobs)
sparsePCA.fit(X_train.loc[:10000, :])
X_train_sparsePCA = sparsePCA.transform(X_train)
X_train_sparsePCA = pd.DataFrame(data=X_train_sparsePCA, index=train_index)

X_validation_sparsePCA = sparsePCA.transform(X_validation)
X_validation_sparsePCA = pd.DataFrame(data=X_validation_sparsePCA,
                                      index=validation_index)

scatterPlot(X_train_sparsePCA, y_train, "Sparse PCA")

# In[ ]:

# Kernel PCA
def Var_Select(orgdata, k, alphaMax=10, alphastep=0.2):
    """
    orgdata   - DataFrame whose information is to be compressed
    k         - expected maximum number of variables to keep; the number
                actually kept will not exceed this value
    alphaMax  - maximum value of the SparsePCA penalty term; it usually has
                to go up to about 5 before the results become satisfactory
    alphastep - step size by which the SparsePCA penalty term is increased
    """
    # step 1: if the dataset is large, subsample it to avoid unnecessary runtime
    if orgdata.iloc[:, 1].count() > 5000:
        data = orgdata.sample(5000)
    else:
        data = orgdata

    # step 2: import the required packages and standardize the data
    from sklearn import preprocessing
    import pandas as pd
    import numpy as np
    from sklearn.decomposition import SparsePCA
    # from functools import reduce
    data = preprocessing.scale(data)
    n_components = k
    # pca_n = list()

    # step 3: run SparsePCA and search for a suitable penalty alpha; stop the
    # loop when each original variable happens to have weight on exactly one
    # principal component
    for i in np.arange(0.1, alphaMax, alphastep):
        pca_model = SparsePCA(n_components=n_components, alpha=i)
        pca_model.fit(data)
        pca = pd.DataFrame(pca_model.components_).T
        n = data.shape[1] - sum(sum(np.array(pca != 0)))  # count the non-zero coefficients
        if n == 0:
            global best_alpha
            best_alpha = i
            break

    # step 4: refit SparsePCA with the penalty found above and obtain the
    # sparse principal component scores
    pca_model = SparsePCA(n_components=n_components, alpha=best_alpha)
    pca_model.fit(data)
    pca = pd.DataFrame(pca_model.components_).T
    data = pd.DataFrame(data)
    score = pd.DataFrame(pca_model.fit_transform(data))

    # step 6: compute the 1 - R^2 value between each original variable and
    # the principal components
    r = []
    R_square = []
    for xk in range(data.shape[1]):      # index over input variables
        for paj in range(n_components):  # index over principal components
            r.append(abs(np.corrcoef(data.iloc[:, xk],
                                     score.iloc[:, paj])[0, 1]))
            r_max1 = max(r)
            r.remove(r_max1)
            r.append(-2)
            r_max2 = max(r)
            R_square.append((1 - r_max1 ** 2) / (1 - r_max2 ** 2))
    R_square = abs(pd.DataFrame(
        np.array(R_square).reshape((data.shape[1], n_components))))
    var_list = []

    # step 7: within each principal component, pick the original variable
    # with the smallest 1 - R^2 value.
    for i in range(n_components):
        vmin = R_square[i].min()
        var_list.append(R_square[R_square[i] == vmin][i].index)
    news_ids = []
    for id in var_list:
        if id not in news_ids:
            news_ids.append(id)
    print(news_ids)
    data_vc = orgdata.iloc[:, np.array(news_ids).reshape(len(news_ids))]
    return data_vc
def sparse_pca(self, n_components, alpha):
    # use the n_components argument rather than a hard-coded value
    pca = SparsePCA(n_components=n_components, alpha=alpha)
    self.X = pca.fit_transform(self.X)
    self.df_c = pd.DataFrame(pca.components_.T, index=self.crimes,
                             columns=list(range(1, n_components + 1)))
    return self.df_c
# csv = "c:/iris44.csv"  # wikipedia Iris_flower_data_set
# 5.1,3.5,1.4,0.2  # ,Iris-setosa ...
N = 40
K = 450000
seed = 1
exec("\n".join(sys.argv[1:]))  # N= ...
np.random.seed(seed)
np.set_printoptions(1, threshold=100, suppress=True)  # .1f
try:
    A = np.genfromtxt(csv, delimiter=",")
    N, K = A.shape
except IOError:
    print('error')
    A = np.random.normal(size=(N, K))  # gen correlated ?

print(len(A[1]), N, K)
print("A:", A)

# pca = PCA(n_components=4)
pca = SparsePCA(n_components=None, alpha=1, ridge_alpha=0.01, max_iter=1000,
                tol=1e-08, method='lars', n_jobs=1, U_init=None, V_init=None,
                verbose=False, random_state=None)
scores = pca.fit_transform(A)
# Unlike PCA, SparsePCA does not expose explained_variance_ratio_.
coeff = pca.components_
# A1 = pca.inverse_transform(coeff)
print("coeff", coeff)
# score = pca.transform(A)
print("score", scores)
# print A1
    fp_mean.append(0)
    fn_mean.append(0)
    f1_mean.append(0)

n = 1
for FrameRange_ind in range(len(offset_list)):
    for sparsePCA_alpha_ind in sparsePCA_alpha:
        # for sparsePCA_ridge_alpha_ind in sparsePCA_ridge_alpha:
        # compute PCA
        ncomp = 5
        offset = offset_list[FrameRange_ind]
        upto = upto_list[FrameRange_ind]
        # if ~upto:
        #     upto = O.Shapes().shape[0]
        PCA_start = time.time()
        p = SparsePCA(n_components=ncomp, alpha=sparsePCA_alpha_ind,
                      ridge_alpha=0.01)
        PCA_end = time.time()
        print("The " + str(n) + " PCA time: " + str(PCA_end - PCA_start))
        Projection_start = time.time()
        scorePCA = p.fit_transform(O.Shapes()[offset:upto, :].T).T
        Projection_end = time.time()
        print("The " + str(n) + " Projection time: " +
              str(Projection_end - Projection_start))
        # explained_variance_ratio = p.explained_variance_ratio_
        plt.figure(1)
        plt.plot(p.components_.T)
        plt.legend(range(5))
        plt.savefig("princomp/" + str(offset) + "to" + str(upto) + "_alpha" +
                    str(sparsePCA_alpha_ind) + ".png", bbox_inches='tight')
        plt.clf()
        plt.figure(2)
        plt.scatter(scorePCA[0, :10000], scorePCA[1, :10000], s=4)
class SPCA(object):
    def __init__(self, n_components=None, alpha=1, ridge_alpha=0.01,
                 max_iter=1000, tol=1e-8, method='lars', n_jobs=None,
                 U_init=None, V_init=None, verbose=False, random_state=None,
                 normalize_components='deprecated'):
        """
        :param n_components:
        :param alpha:
        :param ridge_alpha:
        :param max_iter:
        :param tol:
        :param method:
        :param n_jobs:
        :param U_init:
        :param V_init:
        :param verbose:
        :param random_state:
        :param normalize_components:
        """
        self.model = SparsePCA(n_components=n_components, alpha=alpha,
                               ridge_alpha=ridge_alpha, max_iter=max_iter,
                               tol=tol, method=method, n_jobs=n_jobs,
                               U_init=U_init, V_init=V_init, verbose=verbose,
                               random_state=random_state,
                               normalize_components=normalize_components)

    def fit(self, x, y):
        self.model.fit(X=x, y=y)

    def transform(self, x):
        # return the projection (the original version dropped the result)
        return self.model.transform(X=x)

    def fit_transform(self, x, y=None):
        return self.model.fit_transform(X=x, y=y)

    def get_params(self):
        return self.model.get_params(deep=True)

    def set_params(self, **params):
        return self.model.set_params(**params)

    def get_attributes(self):
        components = self.model.components_
        error = self.model.error_
        n_iter = self.model.n_iter_
        mean = self.model.mean_
        return components, error, n_iter, mean
def apply_band_selection(technique, dataset, predictions, mode, n_components, df_column_entry_dict): if df_column_entry_dict is None: df_column_entry_dict = { } # couldn't care less, this is a lazy way to make all accesses work print("Dataset current shape: " + str(dataset.shape)) print_memory_metrics("before applying band selection method " + technique, df_column_entry_dict) from DeepHyperX.batch import PARAMETER_JSON parameterFile = open(PARAMETER_JSON, "r") import json data = json.load(parameterFile) parameterFile.close() if technique in ["IncrementalPCA"]: # requires special method dataset, _ = applyIncrementalPCA(dataset, n_components) elif technique in data["image_compression"]["extraction"]["techniques"]: extraction_object = None if technique == "PCA": from sklearn.decomposition import PCA """ HybridSN: Exploring 3D-2D CNN Feature Hierarchy for Hyperspectral Image Classification Source code used: https://github.com/gokriznastic/HybridSN/blob/master/Hybrid-Spectral-Net.ipynb Paper: https://arxiv.org/abs/1902.06701 Good parameters: 30 components for Indian Pines, 15 for Salinas and Pavia University """ extraction_object = PCA(n_components=n_components, whiten=True) elif technique == "KernelPCA": from sklearn.decomposition import KernelPCA extraction_object = KernelPCA(kernel="rbf", n_components=n_components, gamma=None, fit_inverse_transform=True, n_jobs=1) elif technique == "SparsePCA": """Sparse PCA uses the links between the ACP and the SVD to extract the main components by solving a lower-order matrix approximation problem.""" from sklearn.decomposition import SparsePCA extraction_object = SparsePCA(n_components=n_components, alpha=0.0001, n_jobs=-1) elif technique == "LDA": # only supervised is supported, y is required if mode != "supervised": print( "warning: mode other than supervised detected for lda, setting it to supervised...\n" ) mode = "supervised" # maximally n_classes - 1 columns, https://stackoverflow.com/questions/26963454/lda-ignoring-n-components from sklearn.discriminant_analysis import LinearDiscriminantAnalysis extraction_object = LinearDiscriminantAnalysis( n_components=n_components) elif technique == "SVD": from sklearn.decomposition import TruncatedSVD extraction_object = TruncatedSVD(n_components=n_components, algorithm='randomized', n_iter=5) elif technique == "GRP": from sklearn.random_projection import GaussianRandomProjection extraction_object = GaussianRandomProjection( n_components=n_components, eps=0.5) elif technique == "SRP": from sklearn.random_projection import SparseRandomProjection extraction_object = SparseRandomProjection( n_components=n_components, density='auto', eps=0.5, dense_output=False) elif technique == "MDS": """O(n^3), uses lots of memory for distance matrix (doesn't fit in 48GB), doesn't fit in GPU memory either, so basically unusable""" from sklearn.manifold import MDS extraction_object = MDS(n_components=n_components, n_init=12, max_iter=200, metric=True, n_jobs=16) elif technique == "MiniBatch": """takes too long""" from sklearn.decomposition import MiniBatchDictionaryLearning extraction_object = MiniBatchDictionaryLearning( n_components=n_components, batch_size=200, alpha=1, n_iter=1) elif technique == "LLE": # modified LLE requires n_neighbors >= n_components """execution takes 20 minutes or so, but it does work, just takes a long time""" from sklearn.manifold import LocallyLinearEmbedding extraction_object = LocallyLinearEmbedding( n_components=n_components, n_neighbors=100, method='modified', n_jobs=4) elif technique == "ICA": 
from sklearn.decomposition import FastICA extraction_object = FastICA(n_components=n_components, algorithm='parallel', whiten=True, max_iter=100) elif technique == "FactorAnalysis": from sklearn.decomposition import FactorAnalysis extraction_object = FactorAnalysis(n_components=n_components) #75 elif technique == "ISOMAP": from sklearn import manifold extraction_object = manifold.Isomap(n_neighbors=5, n_components=n_components, n_jobs=-1) elif technique == "t-SNE": # like PCA, but non-linear (pca is linear) from sklearn.manifold import TSNE extraction_object = TSNE(n_components=n_components, learning_rate=300, perplexity=30, early_exaggeration=12, init='random') elif technique == "UMAP": # install umap-learn for this to work import umap extraction_object = umap.UMAP(n_neighbors=50, min_dist=0.3, n_components=n_components) elif technique == "NMF": # https://www.kaggle.com/remidi/dimensionality-reduction-techniques from sklearn.decomposition import NMF extraction_object = NMF(n_components=n_components, init='nndsvdar', random_state=420) elif technique == "F*G": # super fast and nice from sklearn.cluster import FeatureAgglomeration extraction_object = FeatureAgglomeration(n_clusters=n_components, linkage='ward') else: raise ValueError("Unknown feature extraction technique: " + technique) start_mem_measurement() start = time.time() dataset, _ = applyFeatureExtraction( dataset, predictions, extraction_object, mode, merged=(len(dataset.shape) == 4 and len(predictions.shape) == 3)) time_elapse = time.time() - start event = 'applying band selection method (EXTRACTION) ' + technique formatted_time = str(timedelta(seconds=time_elapse)) df_column_entry_dict['Time measurement at ' + event + ' [s]'] = time_elapse print("\n" + event + " took " + formatted_time + " seconds\n") event = "after applying band selection method " + technique stop_mem_measurement(event, df_column_entry_dict) print_memory_metrics(event, df_column_entry_dict) elif technique in data["image_compression"]["selection"]["techniques"]: selection_object = None if technique == "RandomForest": # Random forests or random decision forests are an ensemble learning method for classification, regression and other # tasks that operates by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes (classification) or mean prediction (regression) of the individual trees.[1][2] Random decision forests correct for decision trees' habit of overfitting to their training set.[3]:587–588 https://en.wikipedia.org/wiki/Random_forest from sklearn.ensemble import RandomForestClassifier selection_object = RandomForestClassifier() elif technique == "LogisticRegression": from sklearn.linear_model import LogisticRegression selection_object = LogisticRegression() elif technique == "LinearRegression": from sklearn.linear_model import LinearRegression selection_object = LinearRegression() elif technique == "LightGBM": from lightgbm import LGBMClassifier selection_object = LGBMClassifier() else: raise ValueError("Unknown feature selection technique: " + technique) start_mem_measurement() start = time.time() dataset, _ = applyFeatureSelection( dataset, predictions, selection_object, n_components, mode, merged=(len(dataset.shape) == 4 and len(predictions.shape) == 3)) time_elapse = time.time() - start event = 'applying band selection method (SELECTION) ' + technique formatted_time = str(timedelta(seconds=time_elapse)) df_column_entry_dict['Time measurement at ' + event + ' [s]'] = time_elapse print("\n" + 
event + " took " + formatted_time + " seconds\n") event = "after applying band selection method " + technique stop_mem_measurement(event, df_column_entry_dict) print_memory_metrics(event, df_column_entry_dict) print("Dataset new shape: " + str(dataset.shape)) return dataset
# Sparse PCA analysis
################################
# Sparse Principal Components Analysis (SparsePCA)
"""
Finds the set of sparse components that can optimally reconstruct the data.
The amount of sparseness is controllable by the coefficient of the L1
penalty, given by the parameter alpha.
"""
from sklearn.decomposition import SparsePCA

# SparsePCA(n_components=None, alpha=1, ridge_alpha=0.01, max_iter=1000,
#           tol=1e-08, method='lars', n_jobs=1, U_init=None, V_init=None,
#           verbose=False, random_state=None)
# method: {'lars', 'cd'}
# alpha: a higher value gives sparser components
spca = SparsePCA(method='lars')
SPCA_OUTPUT = spca.fit(X_all_his_center)
X_spca = spca.fit_transform(X_all_his_center)
np.savetxt("D:/lly/2017MM/PHASE2/final_totoal/SPCA_MM_PCs.csv",
           SPCA_OUTPUT.components_, delimiter=",")

# 2D visualization of the SPCA data projection
fig = plt.figure()
plt.plot(X_spca[reds, 0], X_spca[reds, 1], "ro", markersize=10)
plt.plot(X_spca[blues, 0], X_spca[blues, 1], "b^", alpha=0.5)
plt.plot(X_spca[greens, 0], X_spca[greens, 1], "g+")
plt.legend('LWN')
plt.title("newData under two PCs--SPCA")
plt.xlabel("$1^{st}$ PC")
plt.ylabel("$2^{nd}$ PC")
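# Since the note above says alpha controls the amount of sparseness, here is
# a small self-contained sketch on synthetic data (the alpha values are
# illustrative assumptions) showing how the fraction of exactly-zero loadings
# grows with alpha.
import numpy as np
from sklearn.decomposition import SparsePCA

X = np.random.RandomState(0).randn(60, 30)

for alpha in (0.1, 1, 5):
    spca = SparsePCA(n_components=3, alpha=alpha, random_state=0)
    spca.fit(X)
    sparsity = np.mean(spca.components_ == 0)
    print("alpha=%s: fraction of zero loadings = %.2f" % (alpha, sparsity))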
print(pca.explained_variance_ratio_)
fig = plt.figure()
fig.suptitle('PCA', fontsize=32)
plt.plot(range(len(pca.explained_variance_ratio_)),
         pca.explained_variance_ratio_)
plt.show()
show_figure(fdata, labels, ulabs, 'PCA')

# Sparse PCA
print('Sparse PCA')
from sklearn.decomposition import SparsePCA
spca = SparsePCA(n_components=3)
fdata = spca.fit_transform(authors)
show_figure(fdata, labels, ulabs, 'Sparse PCA')

# ISOMAP
print('ISOMAP')
from sklearn.manifold import Isomap
iso = Isomap(n_components=3, n_neighbors=7)
fdata = iso.fit_transform(authors)
show_figure(fdata, labels, ulabs, 'ISOMAP')

# LLE
print('LLE')
from sklearn.manifold import LocallyLinearEmbedding
import numpy as np

N = 500
P = 10
MU = [0] * P
T = 1  # spike level
K = 2  # sparsity level

V = list(range(1, K + 1)) + [0] * (P - K)
V = V / np.linalg.norm(V)
SIG = np.identity(P) + T * np.matrix(V).transpose() * np.matrix(V)
X = np.matrix(np.random.multivariate_normal(MU, SIG, N))

#####
# using the scikit-learn method for Sparse PCA (treated as an l1-regularized
# dictionary learning problem)
from sklearn.decomposition import SparsePCA
spca = SparsePCA(n_components=1, alpha=5)
spca.fit(X)

from sklearn.decomposition import PCA
pca = PCA(n_components=1)
pca.fit(X)

print('Classical 1st principal component:', pca.components_)
print('Sparse 1st principal component:', spca.components_)

#####
# TODO: SDP implementation a la El Ghaoui, Bach, D'Aspremont
import cvxopt

# TWO CONSTRAINTS
# trace = 1 (multiply with identity)
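# Follow-up sketch for the spiked-covariance comparison above: it reuses the
# names V, pca and spca defined there and quantifies how close each first
# component is to the planted sparse direction. This is an illustrative
# check, not part of the original script.
v_true = np.asarray(V).ravel()
v_pca = pca.components_[0] / np.linalg.norm(pca.components_[0])
w = spca.components_[0]
v_spca = w / np.linalg.norm(w) if np.linalg.norm(w) > 0 else w

# Absolute cosine similarity with the planted direction (the sign is arbitrary).
print('PCA  |cos| with true V:', abs(np.dot(v_true, v_pca)))
print('SPCA |cos| with true V:', abs(np.dot(v_true, v_spca)))
print('Non-zero loadings in the sparse component:', int(np.sum(w != 0)))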
if regen_dicts:
    train_trans, test_trans = regression_translation(model=model)
    # binary mode is required for pickle under Python 3
    pickle.dump((train_trans, test_trans), open(filename, 'wb'))
else:
    train_trans, test_trans = pickle.load(open(filename, 'rb'))
print("training translation with model:{} scored:{}".format(
    model, translation_quality(train_trans)))
print("testing translation with model:{} scored:{}".format(
    model, translation_quality(test_trans)))

try_search = False
if try_search:
    dims = [2, 5, 10, 20, 30, None]
    for dim in dims:
        pre_transforms = [
            TruncatedSVD(n_components=dim),
            PCA(n_components=dim),
            # KernelPCA(n_components=dim, kernel='rbf'),  # TODO fix memory error / try on larger machine
            SparsePCA(n_components=dim)
        ]  # ,
        # DictionaryLearning(n_components=dim)]
        for pre_transform in pre_transforms:
            embedding_translation = nn_embedding_translate(
                pre_transform=pre_transform)
            print("Translation with pt:{} scored:{}".format(
                pre_transform, translation_quality(embedding_translation)))
print(translation_quality(en_2_es))
def spca(X, n_comp, random_state):
    spca = SparsePCA(n_components=n_comp, random_state=random_state)
    spca.fit(X)
    return spca
    test[c] = lbl.transform(list(test[c].values))

n_comp = 12

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
tsvd_results_test = tsvd.transform(test)

# PCA
# pca = PCA(n_components=n_comp, random_state=420)
# pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
# pca2_results_test = pca.transform(test)

# sparse PCA
spca = SparsePCA(n_components=n_comp, random_state=420)
spca2_results_train = spca.fit_transform(train.drop(["y"], axis=1))
spca2_results_test = spca.transform(test)

# Kernel PCA
kpca = KernelPCA(n_components=n_comp, random_state=420)
kpca2_results_train = kpca.fit_transform(train.drop(["y"], axis=1))
kpca2_results_test = kpca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1,
                               random_state=420)
def transform(xTrain, yTrain, xTest):
    pca = SparsePCA(n_components=2)
    newXTrain = pca.fit_transform(xTrain, yTrain)
    newXTest = pca.transform(xTest)
    return newXTrain, newXTest
from sklearn.decomposition import SparsePCA
from sklearn.preprocessing import StandardScaler

# convert RData to a pandas DataFrame
readin = pyreadr.read_r('C:/Users/TW/Downloads/west.RData')
westdf = readin["west"]
chapter = westdf[['chapter']]

# preprocess the data
westdf = westdf.drop(['chapter'], axis=1)  # delete the 'chapter' column, 408*302
x = westdf.loc[:, :].values
x = StandardScaler(with_std=False).fit_transform(x)  # center the data

# SparsePCA transform
transformer = SparsePCA(n_components=3,
                        alpha=0.1,
                        normalize_components=True,
                        random_state=0)
x_transformed = transformer.fit_transform(x)

# for data analysis
x_transformed.shape
transformer.alpha
eigenvectors = transformer.components_
transformer.error_
transformer.get_params(deep=True)
np.mean(transformer.components_ == 0)

westspca = pd.DataFrame(data=eigenvectors, columns=westdf.columns)
Spca1 = westspca.sort_values(by=[0], axis=1)
Spca2 = westspca.sort_values(by=[1], axis=1)
Spca3 = westspca.sort_values(by=[2], axis=1)
# Only three components were fitted, so there is no row 3 to sort by.
    'phate':
    lambda args: PHATE(n_components=args.dims, n_jobs=args.njobs)
    if PHATE_AVAILABLE else _embedding_error(),
    'pmf':
    lambda args: NimfaWrapper(nimfa.Pmf, args.dims),
    'psmf':
    lambda args: NimfaWrapper(nimfa.Psmf, args.dims),
    'saucie':
    lambda args: SaucieWrapper(args.dims)
    if SAUCIE_AVAILABLE else _embedding_error(),
    'scscope':
    lambda args: ScScope(args.dims)
    if SCSCOPE_AVAILABLE else _embedding_error(),
    'sepnmf':
    lambda args: NimfaWrapper(nimfa.SepNMF, args.dims),
    'spca':
    lambda args: SparsePCA(n_components=args.dims,
                           n_jobs=args.njobs,
                           normalize_components=True),
    'spca-batch':
    lambda args: MiniBatchSparsePCA(n_components=args.dims,
                                    n_jobs=args.njobs,
                                    normalize_components=True),
    'spectral':
    lambda args: SpectralEmbedding(n_components=args.dims,
                                   n_jobs=args.njobs),
    'snmf':
    lambda args: NimfaWrapper(nimfa.Snmf, args.dims),
    'srp':
    lambda args: SparseRandomProjection(n_components=args.dims),
    'tga':
    lambda args: TGA(n_components=args.dims)
    if TGA_AVAILABLE else _embedding_error(),
    'tsvd':
    lambda args: TruncatedSVD(n_components=args.dims),
    'tsne':
def test_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    if sys.platform == 'win32':  # fake parallelism for win32
        import sklearn.externals.joblib.parallel as joblib_par
        _mp = joblib_par.multiprocessing
        joblib_par.multiprocessing = None
        try:
            spca = SparsePCA(n_components=3, n_jobs=2, random_state=0,
                             alpha=alpha).fit(Y)
            U2 = spca.transform(Y)
        finally:
            joblib_par.multiprocessing = _mp
    else:  # we can efficiently use parallelism
        spca = SparsePCA(n_components=3, n_jobs=2, method='lars', alpha=alpha,
                         random_state=0).fit(Y)
        U2 = spca.transform(Y)
    assert_true(not np.all(spca_lars.components_ == 0))
    assert_array_almost_equal(U1, U2)
    # Test that CD gives similar results
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=0,
                           alpha=alpha)
    spca_lasso.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
def textSimilarity(): NeighborDirectory = GEOTEXT_HOME # matplotlib.use('Agg') DATA_FOLDER = userTextDirectory # DATA_FOLDER = "/GEOTEXT_HOME/af/Downloads/review_polarity/txt_sentoken" K_FOLD = 10 data_target = load_files(DATA_FOLDER, encoding=encoding) filenames = data_target.filenames DO_PCA = True DO_SPARSEPCA = False Reduction_D = 100 DO_SVD = False categories = data_target.target_names DO_NMF = False def size_mb(docs): return sum(len(s.encode(encoding)) for s in docs) / 1e6 data_size_mb = size_mb(data_target.data) print("%d documents - %0.3fMB (all data set)" % ( len(data_target.data), data_size_mb)) print("%d categories" % len(categories)) print() # split a training set and a test set target = data_target.target print("Extracting features from all the dataset using a sparse vectorizer") t0 = 0 vectorizer = TfidfVectorizer(use_idf=True, norm='l2', binary=False, sublinear_tf=True, min_df=2, max_df=0.2, ngram_range=(1, 1), stop_words='english') # vectorizer = CountVectorizer(min_df=2, max_df=1.0, ngram_range=(1, 4)) # the output of the fit_transform (x_train) is a sparse csc matrix. data = vectorizer.fit_transform(data_target.data) print data.dtype data = csr_matrix(data, dtype=float32) print data.dtype duration = 1 print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration)) print("n_samples: %d, n_features: %d" % data.shape) print() if DO_PCA: print("dimension reduction pca with d=%d" % Reduction_D) pca = PCA(n_components=Reduction_D, copy=True, whiten=False) print type(data) data = pca.fit_transform(data.todense()) if DO_SPARSEPCA: print("dimension reduction sparsepca with d=%d" % Reduction_D) spca = SparsePCA(Reduction_D) data = spca.fit_transform(data.toarray()) if DO_SVD: print("dimension reduction svd with d=%d" % Reduction_D) svd = TruncatedSVD(n_components=Reduction_D, algorithm="randomized", n_iterations=5, random_state=None, tol=0) data = svd.fit_transform(data) if DO_NMF: print("dimension reduction nmf with d=%d" % Reduction_D) nmf = NMF(n_components=Reduction_D) data = nmf.fit_transform(data) DO_CHI = False if DO_CHI: print("Extracting best features by a chi-squared test") ch2NumFeatures = 1000 ch2 = SelectKBest(chi2, k=ch2NumFeatures) # print vectorizer.get_stop_words() data = ch2.fit_transform(data, target) # print data KNN = 10 nn = NearestNeighbors(n_neighbors=KNN + 1, algorithm='ball_tree').fit(data) # query and data are the same so every node is counted as its most similar here distances, indices = nn.kneighbors(data) with codecs.open(path.join(NeighborDirectory, 'neighbors.txt'), 'w', encoding) as outf: nodeIndex = -1 nodeNeighbors = [] for neighbors in indices: nodeIndex += 1 outf.write(path.basename(filenames[nodeIndex]) + ' ') for neighbor in neighbors: if neighbor == nodeIndex: continue else: outf.write(path.basename(filenames[neighbor]) + ' ') outf.write('\n')
def _cluster_analysis(feats_data, save_name, is_color_time=False): #%% ##### Clustering analysis df_n = feats_data.dropna() df = df_n[set_feats].copy() index_data = df_n[index_cols].reset_index() X = df.values.copy() #[x_min==x_max] x_min, x_max = df.min(), df.max() df = (df - x_min)/(x_max - x_min) X = df.values #%% #### labels and indexes vectors nz = int(np.ceil(np.log10(index_data['time_group']+ 0.001).max())) time_g_str = [('%1.1f' % x).zfill(nz+2) for x in index_data['time_group'].values] cohort_str = [str(int(x)) for x in index_data['cohort_n']] labels = ['C{}_T{}'.format(*x) for x in zip(cohort_str, time_g_str)] label_order = sorted(list(set(labels))) uC = sorted(list(set(cohort_str))) uT = sorted(list(set(time_g_str))) filled_markers = ('o', 's', 'v', '^', '<', '>', '8', 'p', '*', 'h', 'H', 'D', 'd', 'P', 'X') if is_color_time: cols = sns.color_palette("RdYlGn", len(uT)) col_dict_u = {k : v for k,v in zip(time_g_str, cols)} col_dict = {ll : col_dict_u[tt] for ll, tt in zip(labels, time_g_str)} mks_dict = {x : filled_markers[ii] for ii, x in enumerate(uC)} mks = [mks_dict[x[1]] for x in label_order] else: cols = sns.color_palette("colorblind", len(uC)) col_dict_u = {k : v for k,v in zip(uC, cols)} col_dict = {ll : col_dict_u[tt] for ll, tt in zip(labels, cohort_str)} mks_dict = {x : filled_markers[ii] for ii, x in enumerate(uT)} mks = [mks_dict[x.partition('_T')[-1]] for x in label_order] #%% tsne = TSNE(n_components=2, #perplexity = 21, init='pca', verbose=1, n_iter=10000 ) X_tsne = tsne.fit_transform(X) #%% pca_s = SparsePCA() X_pca_s = pca_s.fit_transform(X) pca = PCA() X_pca = pca.fit_transform(X) #%% dat = {'t-SNE':X_tsne, 'PCA':X_pca, 'PCA_Sparse':X_pca_s} with PdfPages(save_name) as pdf_pages: for k, Xp in dat.items(): _plot_clusters(Xp, labels, label_order, col_dict, mks) plt.title(k) pdf_pages.savefig() plt.close() #%% return dat
def niftidecomp_workflow( decompaxis, datafile, outputroot, datamaskname=None, decomptype="pca", pcacomponents=0.5, icacomponents=None, varnorm=True, demean=True, sigma=0.0, ): print(f"Will perform {decomptype} analysis along the {decompaxis} axis") if decompaxis == "temporal": decompaxisnum = 1 transposeifspatial = lambda *a, **k: None else: decompaxisnum = 0 transposeifspatial = np.transpose # save the command line tide_io.writevec([" ".join(sys.argv)], outputroot + "_commandline.txt") # read in data print("reading in data arrays") ( datafile_img, datafile_data, datafile_hdr, datafiledims, datafilesizes, ) = tide_io.readfromnifti(datafile) if datamaskname is not None: ( datamask_img, datamask_data, datamask_hdr, datamaskdims, datamasksizes, ) = tide_io.readfromnifti(datamaskname) xsize, ysize, numslices, timepoints = tide_io.parseniftidims(datafiledims) xdim, ydim, slicethickness, tr = tide_io.parseniftisizes(datafilesizes) # check dimensions if datamaskname is not None: print("checking mask dimensions") if not tide_io.checkspacedimmatch(datafiledims, datamaskdims): print("input mask spatial dimensions do not match image") exit() if not (tide_io.checktimematch(datafiledims, datamaskdims) or datamaskdims[4] == 1): print("input mask time dimension does not match image") exit() # save the command line tide_io.writevec([" ".join(sys.argv)], outputroot + "_commandline.txt") # smooth the data if sigma > 0.0: print("smoothing data") for i in range(timepoints): datafile_data[:, :, :, i] = tide_filt.ssmooth(xdim, ydim, slicethickness, sigma, datafile_data[:, :, :, i]) # allocating arrays print("reshaping arrays") numspatiallocs = int(xsize) * int(ysize) * int(numslices) rs_datafile = datafile_data.reshape((numspatiallocs, timepoints)) print("masking arrays") if datamaskname is not None: if datamaskdims[4] == 1: proclocs = np.where(datamask_data.reshape(numspatiallocs) > 0.5) else: proclocs = np.where( np.mean(datamask_data.reshape((numspatiallocs, timepoints)), axis=1) > 0.5) rs_mask = datamask_data.reshape( (numspatiallocs, timepoints))[proclocs, :] rs_mask = np.where(rs_mask > 0.5, 1.0, 0.0)[0] else: datamaskdims = [1, xsize, ysize, numslices, 1] themaxes = np.max(rs_datafile, axis=1) themins = np.min(rs_datafile, axis=1) thediffs = (themaxes - themins).reshape(numspatiallocs) proclocs = np.where(thediffs > 0.0) procdata = rs_datafile[proclocs, :][0] print(rs_datafile.shape, procdata.shape) # normalize the individual images if demean: print("demeaning array") themean = np.mean(procdata, axis=decompaxisnum) print("shape of mean", themean.shape) for i in range(procdata.shape[1 - decompaxisnum]): if decompaxisnum == 1: procdata[i, :] -= themean[i] else: procdata[:, i] -= themean[i] else: themean = np.ones(procdata.shape[1 - decompaxisnum]) if varnorm: print("variance normalizing array") thevar = np.var(procdata, axis=decompaxisnum) print("shape of var", thevar.shape) for i in range(procdata.shape[1 - decompaxisnum]): if decompaxisnum == 1: procdata[i, :] /= thevar[i] else: procdata[:, i] /= thevar[i] procdata = np.nan_to_num(procdata) else: thevar = np.ones(procdata.shape[1 - decompaxisnum]) # applying mask if datamaskdims[4] > 1: procdata *= rs_mask # now perform the decomposition if decomptype == "ica": print("performing ica decomposition") if icacomponents is None: print("will return all significant components") else: print("will return", icacomponents, "components") thefit = FastICA(n_components=icacomponents).fit( transposeifspatial(procdata)) # Reconstruct signals if icacomponents is None: 
thecomponents = transposeifspatial(thefit.components_[:]) print(thecomponents.shape[1], "components found") else: thecomponents = transposeifspatial( thefit.components_[0:icacomponents]) print("returning first", thecomponents.shape[1], "components found") else: print("performing pca decomposition") if pcacomponents < 1.0: print( "will return the components accounting for", pcacomponents * 100.0, "% of the variance", ) else: print("will return", pcacomponents, "components") if decomptype == "pca": thepca = PCA(n_components=pcacomponents) else: thepca = SparsePCA(n_components=pcacomponents) thefit = thepca.fit(transposeifspatial(procdata)) thetransform = thepca.transform(transposeifspatial(procdata)) theinvtrans = transposeifspatial( thepca.inverse_transform(thetransform)) if pcacomponents < 1.0: thecomponents = transposeifspatial(thefit.components_[:]) print("returning", thecomponents.shape[1], "components") else: thecomponents = transposeifspatial( thefit.components_[0:pcacomponents]) # save the eigenvalues print("variance explained by component:", 100.0 * thefit.explained_variance_ratio_) tide_io.writenpvecs( 100.0 * thefit.explained_variance_ratio_, outputroot + "_explained_variance_pct.txt", ) if decompaxis == "temporal": # save the components print("writing component timecourses") tide_io.writenpvecs(thecomponents, outputroot + "_components.txt") # save the singular values print("writing singular values") tide_io.writenpvecs(np.transpose(thesingvals), outputroot + "_singvals.txt") # save the coefficients print("writing out the coefficients") coefficients = thetransform print("coefficients shape:", coefficients.shape) theheader = datafile_hdr theheader["dim"][4] = coefficients.shape[1] tempout = np.zeros((numspatiallocs, coefficients.shape[1]), dtype="float") tempout[proclocs, :] = coefficients[:, :] tide_io.savetonifti( tempout.reshape( (xsize, ysize, numslices, coefficients.shape[1])), datafile_hdr, outputroot + "_coefficients", ) # unnormalize the dimensionality reduced data for i in range(numspatiallocs): theinvtrans[i, :] = thevar[i] * theinvtrans[i, :] + themean[i] else: # save the component images print("writing component images") theheader = datafile_hdr theheader["dim"][4] = thecomponents.shape[1] tempout = np.zeros((numspatiallocs, thecomponents.shape[1]), dtype="float") tempout[proclocs, :] = thecomponents[:, :] tide_io.savetonifti( tempout.reshape( (xsize, ysize, numslices, thecomponents.shape[1])), datafile_hdr, outputroot + "_components", ) # save the coefficients print("writing out the coefficients") coefficients = np.transpose(thetransform) tide_io.writenpvecs(coefficients, outputroot + "_coefficients.txt") # unnormalize the dimensionality reduced data for i in range(timepoints): theinvtrans[:, i] = thevar[i] * theinvtrans[:, i] + themean[i] print("writing fit data") theheader = datafile_hdr theheader["dim"][4] = theinvtrans.shape[1] tempout = np.zeros((numspatiallocs, theinvtrans.shape[1]), dtype="float") tempout[proclocs, :] = theinvtrans[:, :] tide_io.savetonifti( tempout.reshape((xsize, ysize, numslices, theinvtrans.shape[1])), datafile_hdr, outputroot + "_fit", )
cnt = 0
feature = [[0 for i in range(0, n_feat)] for j in range(0, 120542)]  # 80362
for line in fin:
    a = line.split(" ")
    for i in range(2, n_feat):
        feature[cnt][i - 2] = float(a[i].split(":")[1])
    cnt += 1
print(cnt)
# print(feature[cnt - 1])
X = np.array(feature)
'''
pca = PCA(n_components=n_feat)
pca_result = pca.fit_transform(X)
'''
pca = SparsePCA(n_components=n_feat, alpha=0.6, n_jobs=2, max_iter=15)
pca_result = pca.fit_transform(X)
# print(pca_result[0])

cnt = 0
fin = open("data/feature/train_gh_97a", 'r')
for line in fin:
    a = line.split(" ")
    PCA_d = 50
    for i in range(0, PCA_d):
        # note: this writes the raw feature values; pca_result computed above
        # is left unused here
        a[i + 2] = str(i) + ":" + str(feature[cnt][i])
    ll = " ".join(a[0:PCA_d + 2])
    fo.write(ll + "\n")
    cnt += 1
fo.close()
for f in cats:
    count += 1
    if count > n:
        break
    try:
        cat = io.imread("sparse-cats/" + f, as_grey=True).flatten()
        cat.shape = (40000, 1)
        images = np.append(images, cat, axis=1)
    except:
        count -= 1
        continue

print("loaded cats...")
tic = time.clock()
print("starting learning...")
pca = SparsePCA(n_components=n, max_iter=1000)
x = pca.fit_transform(images, subject)
print("learning done...")
toc = time.clock()
print(x)

out = np.zeros(40000)
print("starting transform...")
for i in range(40000):
    for j in range(n):
        # out[i] += (x[i, j])
        out[i] += (images[i, j] * x[i, j])
out.shape = (200, 200)
print(out)
name = re.match("people/([a-z]*)_small.jpg", filename).group(1)
def fit(self, dif_df):
    factorization = SparsePCA(n_components=self.n_components, alpha=0.03)
    X = dif_df.values[1:]
    self.ticker_symbols_used = dif_df.columns.values
    factorization.fit(X)
    self.factorization = factorization
class SPCA(object):
    """
    Wrapper for the sklearn package. Performs sparse PCA.

    SPCA has 5 methods:
       - fit(waveforms)
            update the class instance with the SPCA fit
       - fit_transform()
            do what fit() does, but additionally return the projection onto
            SPCA space
       - inverse_transform(A)
            inverses the decomposition, returns waveforms for an input A,
            using Z
       - get_basis()
            returns the basis vectors Z^\dagger
       - get_params()
            returns metadata used for fits.
    """

    def __init__(self, num_components=10, catalog_name='unknown', alpha=0.1,
                 ridge_alpha=0.01, max_iter=2000, tol=1e-9, n_jobs=1,
                 random_state=None):
        self._decomposition = 'Sparse PCA'
        self._num_components = num_components
        self._catalog_name = catalog_name
        self._alpha = alpha
        self._ridge_alpha = ridge_alpha
        self._n_jobs = n_jobs
        self._max_iter = max_iter
        self._tol = tol
        self._random_state = random_state

        self._SPCA = SparsePCA(n_components=self._num_components,
                               alpha=self._alpha,
                               ridge_alpha=self._ridge_alpha,
                               n_jobs=self._n_jobs,
                               max_iter=self._max_iter,
                               tol=self._tol,
                               random_state=self._random_state)

    def fit(self, waveforms):
        # TODO make sure there are more columns than rows (transpose if not)
        # normalize waveforms
        self._waveforms = waveforms
        self._SPCA.fit(self._waveforms)

    def fit_transform(self, waveforms):
        # TODO make sure there are more columns than rows (transpose if not)
        # normalize waveforms
        self._waveforms = waveforms
        self._A = self._SPCA.fit_transform(self._waveforms)
        return self._A

    def inverse_transform(self, A):
        # convert basis back to waveforms using fit
        new_waveforms = self._SPCA.inverse_transform(A)
        return new_waveforms

    def get_params(self):
        # TODO know what catalog was used! (include waveform metadata)
        params = self._SPCA.get_params()
        params['num_components'] = params.pop('n_components')
        params['Decomposition'] = self._decomposition
        return params

    def get_basis(self):
        """Return the SPCA basis vectors (Z^\dagger)."""
        Zt = self._SPCA.components_
        return Zt
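# A small usage sketch for the wrapper above on synthetic "waveforms"; the
# shapes and parameter values are illustrative assumptions only.
import numpy as np

rng = np.random.RandomState(0)
waveforms = rng.randn(50, 200)   # 50 waveforms, 200 samples each

spca = SPCA(num_components=5, catalog_name='demo', alpha=0.1)
A = spca.fit_transform(waveforms)   # (50, 5) projection onto SPCA space
Zt = spca.get_basis()               # (5, 200) sparse basis vectors
print(A.shape, Zt.shape, spca.get_params()['num_components'])
# spca.inverse_transform(A) additionally needs a scikit-learn version that
# provides SparsePCA.inverse_transform.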
res_poly = compare_KernelPCA(kernel='poly')
res_rbf = compare_KernelPCA(kernel='rbf')
res_sigmoid = compare_KernelPCA(kernel='sigmoid')
res_cosine = compare_KernelPCA(kernel='cosine')

kernel_pca_precomputed = KernelPCA(n_components=kernel_pca_n_comp,
                                   kernel='precomputed')
kernel_pca_precomputed_data = kernel_pca_precomputed.fit_transform(
    data.dot(data.T))
kernel_pca_precomputed.lambdas_.round(3)

# ---
# ## Modifications of principal component analysis
# ### SparsePCA

sparse_pca_lars = SparsePCA(2, method='lars')
sparse_pca_lars_data = sparse_pca_lars.fit_transform(data)
print("Sparse PCA with lars method components")
print(sparse_pca_lars.components_)

sparse_pca_cd = SparsePCA(2, method='cd')
sparse_pca_cd_data = sparse_pca_cd.fit_transform(data)
print("Sparse PCA with cd method components")
print(sparse_pca_cd.components_)

fig, axs = plt.subplots(1, 2)
fig.set_figwidth(11)
def main(): accounts = csv_to_dict('accounts.csv', 0, cast_evals=[str, read_time, readOutcome], type="account") account_nodes = csv_to_dict('nodevisits.csv', 1, cast_evals=[str, str, read_time, str], type="node") account_submissions = csv_to_dict('submissions.csv', 1, cast_evals=[str, str, read_time, str, str], type="submission") account_visits = account_nodes for acc in account_visits: account_visits[acc].extend(account_submissions[acc]) account_visits[acc] = sorted(account_visits[acc], key=lambda k: k['time']) session_length(account_visits) #Build sessions based on time scale determined from previous code as 15 minutes sessions = [] for acc in account_visits: actions = [] for idx, visit in enumerate(account_visits[acc]): if idx == 0: actions = {"node": [], "submission": [], "learning_outcome": accounts[acc][0]["learning_outcome"]} actions[account_visits[acc][idx]["type"]].append(visit) else: #Time between visits in minutes delta_time = delta_minutes(visit["time"], account_visits[acc][idx-1]["time"]) #New session, defined as 15 minutes from above if delta_time > 15: sessions.append(actions) actions = {"node": [], "submission": [], "learning_outcome": accounts[acc][0]["learning_outcome"]} actions[account_visits[acc][idx]["type"]].append(visit) else: actions[account_visits[acc][idx]["type"]].append(visit) sessions.append(actions) for session in sessions: if len(session["node"]) > 0 and len(session["submission"]) > 0: session["start_time"] = min(session["node"][0]["time"], session["submission"][0]["time"]) session["end_time"] = max(session["node"] [len(session["node"]) -1]["time"], session["submission"] [len(session["submission"]) -1]["time"]) elif len(session["node"]) > 0: session["start_time"] = session["node"][0]["time"] session["end_time"] = session["node"] [len(session["submission"]) -1]["time"] else: session["start_time"] = session["submission"][0]["time"] session["end_time"] = session["submission"] [len(session["submission"]) -1]["time"] #Remove sessions without any time difference or no nodes visited sessions = [session for session in sessions if delta_minutes(session["end_time"], session["start_time"]) != 0] X = session_properties(sessions) X = standardize(X) pca = SparsePCA(n_components = 2) #Negative one just makes plot easier to look at, PCA is sign insensitive so no real effect X_r = -1 * pca.fit(X).transform(X) kmeans = cluster.KMeans(n_clusters=4) group = kmeans.fit_predict(X_r) fig = plt.figure(figsize=(6,6)) ax = fig.add_subplot(111) plt.rc('font', family='serif', size=20) ax.set_xticklabels([]) ax.set_yticklabels([]) ax.scatter(X_r[:,0], X_r[:,1],s=20,marker = 'o', c=group) plt.show() outcomes = np.asarray([session["learning_outcome"] for session in sessions]) session_by_outcome = [] tags = [] labels = get_labels(X_r, group, 4) for result in range(0, 4): session_by_outcome.append(group[outcomes == result]) if result == 0: tags.append("No certificate achieved") else: tags.append("Mastery Level = " + str(result)) plot_hist(session_by_outcome, x_min = 0, x_max = 4, y_min = 0, y_max = 1, bins = 4, tags = tags, y_label = "Fraction of sessions", labels=labels)