Example #1
def test_initialization():
    rng = np.random.RandomState(0)
    U_init = rng.randn(5, 3)
    V_init = rng.randn(3, 4)
    model = SparsePCA(n_components=3, U_init=U_init, V_init=V_init, max_iter=0, random_state=rng)
    model.fit(rng.randn(5, 4))
    assert_array_equal(model.components_, V_init)
Example #2
def test_fit_transform_tall():
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 65, (8, 8), random_state=rng)  # tall array
    spca_lars = SparsePCA(n_components=3, method="lars", random_state=rng)
    U1 = spca_lars.fit_transform(Y)
    spca_lasso = SparsePCA(n_components=3, method="cd", random_state=rng)
    U2 = spca_lasso.fit(Y).transform(Y)
    assert_array_almost_equal(U1, U2)
Example #3
def test_transform_nan():
    # Test that SparsePCA does not return NaN when a feature is zero in
    # all samples.
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    Y[:, 0] = 0
    estimator = SparsePCA(n_components=8)
    assert_false(np.any(np.isnan(estimator.fit_transform(Y))))
Example #4
 def sparse_pca(self):
     """
     Runs sparse PCA on the view and returns the projected view and the
     principal components.
     """
     model = SparsePCA(n_components=param['components'], alpha=param['sparse_pca_alpha'])
     model.fit(self.view)
     return model.transform(self.view), model.components_
Example #5
def sccodedirect():
    "Get the RPCA result for the data without glasses"
    nglassmodel = np.load('nglassline.npy').astype('f')
    from sklearn.decomposition import SparsePCA
    learning = SparsePCA(500,verbose=True)
    learning.fit(nglassmodel)
    import cPickle
    cPickle.dump(learning,file('sparsepcadirect','wb'),-1)
Example #6
def test_scaling_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=rng, normalize_components=True)
    results_train = spca_lars.fit_transform(Y)
    results_test = spca_lars.transform(Y[:10])
    assert_allclose(results_train[0], results_test[0])
Example #7
def do_sparse_pca(sparse_matrix):
    # from scikit-learn: http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.SparsePCA.html#sklearn.decomposition.SparsePCA

    dense_matrix = sparse_matrix.tobsr().toarray()
    # instantiate the spca with some parameters
    spca = SparsePCA(n_components=6, alpha=0.01, ridge_alpha=0.01, max_iter=1000, tol=1e-08, method='lars', n_jobs=1, U_init=None, V_init=None, verbose=False, random_state=None)

    # train the spca with our matrix
    spca.fit(dense_matrix)

    # return the components
    return spca.components_
Example #8
def test_initialization(norm_comp):
    rng = np.random.RandomState(0)
    U_init = rng.randn(5, 3)
    V_init = rng.randn(3, 4)
    model = SparsePCA(n_components=3, U_init=U_init, V_init=V_init, max_iter=0,
                      random_state=rng, normalize_components=norm_comp)
    model.fit(rng.randn(5, 4))
    if norm_comp:
        assert_allclose(model.components_,
                        V_init / np.linalg.norm(V_init, axis=1)[:, None])
    else:
        assert_allclose(model.components_, V_init)
Example #9
def test_correct_shapes():
    rng = np.random.RandomState(0)
    X = rng.randn(12, 10)
    spca = SparsePCA(n_components=8, random_state=rng)
    U = spca.fit_transform(X)
    assert_equal(spca.components_.shape, (8, 10))
    assert_equal(U.shape, (12, 8))
    # test overcomplete decomposition
    spca = SparsePCA(n_components=13, random_state=rng)
    U = spca.fit_transform(X)
    assert_equal(spca.components_.shape, (13, 10))
    assert_equal(U.shape, (12, 13))
Example #10
    def __init__(self, num_components=10,
                 catalog_name='unknown',
                 alpha = 0.1,
                 ridge_alpha = 0.01,
                 max_iter = 2000,
                 tol = 1e-9,
                 n_jobs = 1,
                 random_state = None):

        self._decomposition  = 'Sparse PCA'
        self._num_components = num_components
        self._catalog_name   = catalog_name
        self._alpha          = alpha
        self._ridge_alpha    = ridge_alpha
        self._n_jobs         = n_jobs
        self._max_iter       = max_iter
        self._tol            = tol
        self._random_state   = random_state

        self._SPCA = SparsePCA(n_components=self._num_components,
                              alpha        = self._alpha,
                              ridge_alpha  = self._ridge_alpha,
                              n_jobs       = self._n_jobs,
                              max_iter     = self._max_iter,
                              tol          = self._tol,
                              random_state = self._random_state)
Example #11
def test_pca_vs_spca():
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
    Z, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)
    spca = SparsePCA(alpha=0, ridge_alpha=0, n_components=2,
                     normalize_components=True)
    pca = PCA(n_components=2)
    pca.fit(Y)
    spca.fit(Y)
    results_test_pca = pca.transform(Z)
    results_test_spca = spca.transform(Z)
    assert_allclose(np.abs(spca.components_.dot(pca.components_.T)),
                    np.eye(2), atol=1e-5)
    results_test_pca *= np.sign(results_test_pca[0, :])
    results_test_spca *= np.sign(results_test_spca[0, :])
    assert_allclose(results_test_pca, results_test_spca)
Example #12
def spca(data, num_components=None, alpha=1):
		# creates a matrix with sparse principal component analysis
		# build matrix with all data
		data = [d.flatten() for d in data if not any(isnan(d))]
		datamatrix = row_stack(data)
		
		# center data
		cdata = datamatrix - mean(datamatrix, axis=0)
		
		if num_components is None:
			num_components = cdata.shape[0]
		
		# do spca on matrix
		spca = SparsePCA(n_components=num_components, alpha=alpha)
		spca.fit(cdata)
		
		# normalize components
		components = spca.components_.T
		for r in xrange(0,components.shape[1]):
			compnorm = numpy.apply_along_axis(numpy.linalg.norm, 0, components[:,r])
			if not compnorm == 0:
				components[:,r] /= compnorm
		components = components.T
		
		# calc adjusted explained variance from "Sparse Principal Component Analysis" by Zou, Hastie, Tibshirani
		spca.components_ = components
		#nuz = spca.transform(cdata).T
		nuz = ridge_regression(spca.components_.T, cdata.T, 0.01, solver='dense_cholesky').T
		
		#nuz = dot(components, cdata.T)
		q,r = qr(nuz.T)
		cumulative_var = []
		for i in range(1,num_components+1):
			cumulative_var.append(trace(r[0:i,]*r[0:i,]))
		explained_var = [math.sqrt(cumulative_var[0])]
		for i in range(1,num_components):
			explained_var.append(math.sqrt(cumulative_var[i])-math.sqrt(cumulative_var[i-1]))
		
		order = numpy.argsort(explained_var)[::-1]
		components = numpy.take(components,order,axis=0)
		evars = numpy.take(explained_var,order).tolist()
		#evars = numpy.take(explained_var,order)
		#order2 = [0,1,2,4,5,7,12,19]
		#components = numpy.take(components,order2,axis=0)
		#evars = numpy.take(evars,order2).tolist()
		
		return components, evars
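A hypothetical usage sketch for the spca helper above (illustrative toy input; the helper itself targets an older Python 2 / scikit-learn stack with NumPy names star-imported into its module, so names such as the 'dense_cholesky' solver may not exist in current releases):

import numpy as np

# toy input: 20 observations, each an 8x8 array that the helper flattens
data = [np.random.randn(8, 8) for _ in range(20)]
components, evars = spca(data, num_components=5, alpha=1)
# components has shape (num_components, 64); evars lists the adjusted
# explained variance of each component in descending order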
Example #13
def test_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    spca = SparsePCA(n_components=3, n_jobs=2, method='lars', alpha=alpha,
                     random_state=0).fit(Y)
    U2 = spca.transform(Y)
    assert_true(not np.all(spca_lars.components_ == 0))
    assert_array_almost_equal(U1, U2)
    # Test that CD gives similar results
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=0,
                           alpha=alpha)
    spca_lasso.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
Example #14
def test_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3,
                          method='lars',
                          alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)

    # Test that CD gives similar results
    spca_lasso = SparsePCA(n_components=3,
                           method='cd',
                           random_state=0,
                           alpha=alpha)
    spca_lasso.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
Example #15
def test_fit_transform_tall(norm_comp):
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 65, (8, 8), random_state=rng)  # tall array
    spca_lars = SparsePCA(n_components=3, method='lars',
                          random_state=rng, normalize_components=norm_comp)
    U1 = spca_lars.fit_transform(Y)
    spca_lasso = SparsePCA(n_components=3, method='cd',
                           random_state=rng, normalize_components=norm_comp)
    U2 = spca_lasso.fit(Y).transform(Y)
    assert_array_almost_equal(U1, U2)
Example #16
def process_dim_reduction(method='pca', n_dim=10):
    """
    Default linear dimensionality reduction method. For each method, return a
    BaseEstimator instance corresponding to the method given as input.

	Attributes
    -------
    method: str, default to 'pca'
    	Method used for dimensionality reduction.
    	Implemented: 'pca', 'ica', 'fa' (Factor Analysis), 
    	'nmf' (Non-negative matrix factorisation), 'sparsepca' (Sparse PCA).
    
    n_dim: int, default to 10
    	Number of domain-specific factors to compute.

    Return values
    -------
    Classifier, i.e. BaseEstimator instance
    """

    if method.lower() == 'pca':
        clf = PCA(n_components=n_dim)

    elif method.lower() == 'ica':
        print('ICA')
        clf = FastICA(n_components=n_dim)

    elif method.lower() == 'fa':
        clf = FactorAnalysis(n_components=n_dim)

    elif method.lower() == 'nmf':
        clf = NMF(n_components=n_dim)

    elif method.lower() == 'sparsepca':
        clf = SparsePCA(n_components=n_dim,
                        alpha=10.,
                        tol=1e-4,
                        verbose=10,
                        n_jobs=1)

    elif method.lower() == 'pls':
        clf = PLS(n_components=n_dim)

    else:
        raise NameError('%s is not an implemented method' % (method))

    return clf
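A brief usage sketch for process_dim_reduction (the toy matrix and parameter values below are illustrative, not taken from the original project):

import numpy as np

X = np.random.rand(100, 50)                       # 100 samples, 50 features
clf = process_dim_reduction(method='sparsepca', n_dim=10)
X_reduced = clf.fit_transform(X)                  # shape (100, 10)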
Example #17
def new(stop_words=[],decomposition='SVD',n_components=5):

  # Prepare vectoriser engines
  idf = TfidfVectorizer(
    ngram_range=(1,3), #Unigram,bigram,& trigram
    stop_words=stop_words
  )

  # Prepare normaliser
  norm = Normalizer(norm='max')

  print(colored('Texthasher model created','yellow'))

  # Prepare dimensionality reduction
  if decomposition and n_components:
    if decomposition=='LDA': # Results in Non-negative matrix
      reducer = LatentDirichletAllocation( # TFIDF --> Topic term
        n_topics=n_components,
        max_doc_update_iter=20,
        max_iter=8  
      )
      return [idf,norm,reducer]

    elif decomposition=='SVD':
      reducer = TruncatedSVD( # Best for small dataset, 
        n_components,         # nightmare for large dataset
        n_iter=8) # Damn slow

      return [idf,norm,reducer]

    elif decomposition=='PCA':
      # When using IPCA, remember to always keep:
      # n_samples > n_components > batch_size
      # reducer = IncrementalPCA(n_components)

      # Sparse -> Dense greedily consumes large amount of mem
      # to_dense = SparseToDense()

      # return [idf,norm,to_dense,reducer]

      reducer = SparsePCA(n_components)
      return [idf,norm,reducer]

    return [idf,norm]
  else:
    return [idf,norm]
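A hedged sketch of chaining the stages returned by new() with scikit-learn's make_pipeline (the tiny corpus is made up for illustration; the function also assumes colored from termcolor is importable in its module):

from sklearn.pipeline import make_pipeline

docs = ["sparse pca example", "another short document", "text hashing demo"]
stages = new(stop_words=['the', 'a'], decomposition='SVD', n_components=2)
pipe = make_pipeline(*stages)          # TfidfVectorizer -> Normalizer -> TruncatedSVD
X_reduced = pipe.fit_transform(docs)   # dense array of shape (3, 2)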
Example #18
def test_fit_transform_parallel():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3,
                          method='lars',
                          alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    spca = SparsePCA(n_components=3,
                     n_jobs=2,
                     method='lars',
                     alpha=alpha,
                     random_state=0).fit(Y)
    U2 = spca.transform(Y)
    assert not np.all(spca_lars.components_ == 0)
    assert_array_almost_equal(U1, U2)
Example #19
def test_correct_shapes():
    rng = np.random.RandomState(0)
    X = rng.randn(12, 10)
    spca = SparsePCA(n_components=8, random_state=rng)
    U = spca.fit_transform(X)
    assert_equal(spca.components_.shape, (8, 10))
    assert_equal(U.shape, (12, 8))
    # test overcomplete decomposition
    spca = SparsePCA(n_components=13, random_state=rng)
    U = spca.fit_transform(X)
    assert_equal(spca.components_.shape, (13, 10))
    assert_equal(U.shape, (12, 13))
Example #20
def reduce_dimension(name, x, n_components):
    algorithms = {
        'factor_analysis':
        FactorAnalysis(random_state=0, n_components=n_components),
        'fast_ica':
        FastICA(random_state=0, n_components=n_components),
        'nmf':
        Pipeline([('min_max', MinMaxScaler()),
                  ('nmf', NMF(random_state=0, n_components=n_components))]),
        'pca':
        PCA(random_state=0, n_components=n_components),
        'sparse_pca':
        SparsePCA(random_state=0, n_components=n_components),
        'truncated_svd':
        TruncatedSVD(random_state=0, n_components=n_components)
    }
    return Pipeline([(name, algorithms.get(name)),
                     ('min_max', MinMaxScaler())]).fit_transform(x)
Example #21
def test_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)

    # Test that CD gives similar results
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=0,
                           alpha=alpha)
    spca_lasso.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
Example #22
def test_fit_transform(norm_comp):
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0, normalize_components=norm_comp)
    spca_lars.fit(Y)

    # Test that CD gives similar results
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=0,
                           alpha=alpha, normalize_components=norm_comp)
    spca_lasso.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)

    # Test that deprecated ridge_alpha parameter throws warning
    warning_msg = "The ridge_alpha parameter on transform()"
    assert_warns_message(DeprecationWarning, warning_msg, spca_lars.transform,
                         Y, ridge_alpha=0.01)
    assert_warns_message(DeprecationWarning, warning_msg, spca_lars.transform,
                         Y, ridge_alpha=None)
Example #23
File: spca.py Project: Silver-L/Sparse-PCA
def main():
    parser = argparse.ArgumentParser(description='py, Dirout, EUDT_txt, num_case')

    parser.add_argument('--Dirout', '-i1', default='F:/SPCA_debug/result',
                        help='Dirout_path')
    parser.add_argument('--EUDT_text', '-i2', default='F:/SPCA_debug/input.txt',
                        help='EUDT(training_data_list)(.txt)')
    parser.add_argument('--num_case', '-i3', default='50', help='num of training data(int)',
                        type=int)

    args = parser.parse_args()

    case_size = int(512 * 512 * 1)

    # load data
    print('load data')
    case = np.zeros((args.num_case, case_size))

    with open(args.EUDT_text, 'rt') as f:
        i = 0
        for line in f:
            if i >= args.num_case:
                break
            line = line.split()
            case[i, :] = IO.read_raw(line[0], dtype='double')
            i += 1

    print(case.shape)

    # Prepare for pca
    print('process pca')
    spca = SparsePCA(n_components=args.num_case - 1)

    # Do pca and map to Principal component
    spca.fit(case)

    # # mean_vector
    # mean_vector = pca.mean_

    # components
    U = spca.components_

    for i in range(0, args.num_case - 1):
        IO.write_raw(U[i, :].copy(), args.Dirout + '/vect_' + str(i).zfill(4) + '.vect')  # PCs
Example #24
def make_methods_plot(labeled=True):
    file_out = "methods"
    n_components = 2
    n_neighbors = 20
    methods = {
        "LLE": LocallyLinearEmbedding(n_neighbors=n_neighbors),
        # "Spectral NN": SpectralEmbedding(affinity="nearest_neighbors"),
        # "Spectral RBF": SpectralEmbedding(affinity="rbf"),
        "PCA": PCA(n_components=n_components),
        "IncrementalPCA": IncrementalPCA(n_components=n_components),
        "KernelPCA": KernelPCA(n_components=n_components),
        "SparsePCA": SparsePCA(n_components=n_components),
        "TruncatedSVD": TruncatedSVD(n_components=n_components),
        # f"TSNE(perplexity = {n_neighbors})": TSNE(perplexity=n_neighbors),
    }
    if labeled:
        make_plot_labeled(methods, file_out=f"{file_out}_labeled")
    else:
        make_plot_unlabeled(methods, file_out=f"{file_out}_unlabeled")
Example #25
def test_fit_transform_parallel(norm_comp):
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0, normalize_components=norm_comp)
    spca_lars.fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    spca = SparsePCA(n_components=3, n_jobs=2, method='lars', alpha=alpha,
                     random_state=0, normalize_components=norm_comp).fit(Y)
    U2 = spca.transform(Y)
    assert_true(not np.all(spca_lars.components_ == 0))
    assert_array_almost_equal(U1, U2)
Example #26
 def dim_reduction_method(self):
     """
     select dimensionality reduction method
     """
     if self.dim_reduction=='pca':
         return PCA()
     elif self.dim_reduction=='factor-analysis':
         return FactorAnalysis()
     elif self.dim_reduction=='fast-ica':
         return FastICA()
     elif self.dim_reduction=='kernel-pca':
         return KernelPCA()
     elif self.dim_reduction=='sparse-pca':
         return SparsePCA()
     elif self.dim_reduction=='truncated-svd':
         return TruncatedSVD()
     elif self.dim_reduction!=None:
         raise ValueError('%s is not a supported dimensionality reduction method. Valid inputs are: \
                          "pca","factor-analysis","fast-ica,"kernel-pca","sparse-pca","truncated-svd".' 
                          %(self.dim_reduction))
Example #27
def get_dim_reds_scikit(pct_features):
	n_components = max(int(pct_features * num_features), 1)
	return [
	LinearDiscriminantAnalysis(n_components=n_components),
	TruncatedSVD(n_components=n_components),
	#SparseCoder(n_components=n_components),
	DictionaryLearning(n_components=n_components),
	FactorAnalysis(n_components=n_components),
	SparsePCA(n_components=n_components),
	NMF(n_components=n_components),
	PCA(n_components=n_components),
	RandomizedPCA(n_components=n_components),
	KernelPCA(kernel="linear", n_components=n_components),
	KernelPCA(kernel="poly", n_components=n_components),
	KernelPCA(kernel="rbf", n_components=n_components),
	KernelPCA(kernel="sigmoid", n_components=n_components),
	KernelPCA(kernel="cosine", n_components=n_components),
	Isomap(n_components=n_components),
	LocallyLinearEmbedding(n_components=n_components, eigen_solver='auto', method='standard'),
	LocallyLinearEmbedding(n_neighbors=n_components, n_components=n_components, eigen_solver='auto', method='modified'),
	LocallyLinearEmbedding(n_neighbors=n_components, n_components=n_components, eigen_solver='auto', method='ltsa'),
	SpectralEmbedding(n_components=n_components)
	]
Example #28
def test_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)

    # Test that CD gives similar results
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=0,
                           alpha=alpha)
    spca_lasso.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)

    # Test that deprecated ridge_alpha parameter throws warning
    warning_msg = "The ridge_alpha parameter on transform()"
    assert_warns_message(DeprecationWarning, warning_msg, spca_lars.transform,
                         Y, ridge_alpha=0.01)
    assert_warns_message(DeprecationWarning, warning_msg, spca_lars.transform,
                         Y, ridge_alpha=None)
Example #29
def WeightsEstimatedFromSparsePCA(ret_port, n_com=25):
    tf = SparsePCA(n_components=n_com)  # , random_state=0)
    tf.fit(ret_port.agg(lambda x: x - x.mean()).fillna(0.0))  # mind the scale of the inputs
    tf.transform(
        ret_port.fillna(0.0)
    )  # .apply(lambda x:x.where(~x.isnull(),x.mean()),axis=0))#,index=date_investing[date_investing<'2019-12'])
    # Adjust the sign of each portfolio according to its average return
    weights = pd.DataFrame(tf.components_, columns=signal_names.split(',')).T
    ret_transformed_port = (cov_chara_ret.fillna(0.0) @ weights).replace(
        0.0, np.nan)
    for c in weights.columns:
        weights[c] = weights[c] * np.sign(
            ret_transformed_port[c].mean()) / np.abs(weights[c]).sum()
    ret_transformed_port = (cov_chara_ret.fillna(0.0) @ weights).replace(
        0.0, np.nan)
    # Select portfolios either by t-value or by Sharpe ratio (SR)
    select_port = np.abs(
        PortfolioAnalysis(ret_transformed_port.dropna(
            how='all',
            axis=1))).T.sort_values(by='SR',
                                    ascending=False).index[:int(n_com * 0.67)]
    for p in select_port:
        weights[p] *= np.sign(ret_transformed_port[p].mean())
    return weights[select_port]
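The sign-flip and L1 normalization that the loop above applies to each component's weights can be isolated in a small pandas sketch (random data, not the original portfolio returns):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
weights = pd.DataFrame(rng.normal(size=(5, 3)), columns=['c0', 'c1', 'c2'])
port_ret = pd.DataFrame(rng.normal(size=(100, 3)), columns=['c0', 'c1', 'c2'])

for c in weights.columns:
    # flip the sign so the component portfolio has a positive mean return,
    # then rescale the weights to unit L1 norm
    weights[c] = weights[c] * np.sign(port_ret[c].mean()) / np.abs(weights[c]).sum()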
Example #30
def WeightsEstimatedFromSparsePCAWithWeightedCovariance(ret_p, n_com=30):
    ret_port = ret_p.dropna(how='all', axis=1)
    tf = SparsePCA(n_components=n_com)  # , random_state=0)
    cov_matrix = WeightedCovariance(ret_port)
    tf.fit(cov_matrix)  # mind the scale of the inputs
    tf.transform(
        ret_port.fillna(0.0)
    )  # .apply(lambda x:x.where(~x.isnull(),x.mean()),axis=0))#,index=date_investing[date_investing<'2019-12'])
    # Adjust the sign of each portfolio according to its average return
    weights = pd.DataFrame(tf.components_, columns=cov_matrix.columns).T
    ret_transformed_port = (ret_port.fillna(0.0) @ weights).replace(
        0.0, np.nan)
    for c in ret_transformed_port.columns:
        weights[c] = weights[c] * np.sign(
            ret_transformed_port[c].mean()) / np.abs(weights[c]).sum()
    ret_transformed_port = (ret_port.fillna(0.0) @ weights).replace(
        0.0, np.nan)
    # Select portfolios either by t-value or by Sharpe ratio (SR)
    select_port = np.abs(
        PortfolioAnalysis(ret_transformed_port)).T.sort_values(
            by='SR', ascending=False).index
    for p in select_port:
        weights[p] *= np.sign(ret_transformed_port[p].mean())
    return weights[select_port]
Example #31
File: CCI_main.py Project: c235gsy/CCI
            matrix_KPCA, clusters)
        # print ("get KPCA BF matrix")
        # print (matrix_KPCA_BF)
        # print ("get KPCA mean matrix")
        # print (matrix_KPCA_mean)
        matrix_KPCA_BF.to_csv(gl.get_value("outputFile") + "_KPCA_BF.txt",
                              sep='\t',
                              header=True,
                              index=True)
        matrix_KPCA_mean.to_csv(gl.get_value("outputFile") + "_KPCA_mean.txt",
                                sep='\t',
                                header=True,
                                index=True)

    if gl.get_value("SPCA_Flag"):
        spca = SparsePCA(n_components=gl.get_value("SPCA_n_components"))
        spca.fit(wholeData)
        expre_SPCA = spca.transform(expre_data)
        # print ("get SPCA data")
        matrix_SPCA = Methods.get_matrix_dist(
            data=expre_SPCA,
            lab=lab,
            clusters=clusters,
            average_number=gl.get_value("SPCA_AvgNum"),
            caculation_number=gl.get_value("SPCA_CalNum"))
        # print ("get SPCA matrix")
        matrix_SPCA_BF = Methods.disMatrix_to_bfMatrix(matrix_SPCA, clusters)
        matrix_SPCA_mean = Methods.disMatrix_to_meanMatrix(
            matrix_SPCA, clusters)
        # print ("get SPCA BF matrix")
        # print (matrix_SPCA_BF)
Example #32
        plt.scatter(X[y == label, 0], X[y == label, 1], color=color, label=class_name)
    plt.title(title)
    plt.legend(loc='best')


# Visualization before the transformation; only the first two dimensions are shown
plt.figure(1)
plot_func('origin data')

# KernelPCA is a non-linear reduction; LDA can only be used for supervised (class-based) reduction
# ICA is usually used to separate superimposed signals rather than to reduce dimensionality
models_list = [('LDA', LinearDiscriminantAnalysis(n_components=2)), ('PCA', PCA(n_components=2, random_state=0)),
               ('PCARand', PCA(n_components=2, random_state=0, svd_solver='randomized')),
               ('IncrementalPCA', IncrementalPCA(n_components=2, batch_size=10, whiten=True)), ('FactorAnalysis', FactorAnalysis(n_components=2, max_iter=500)),
               ('FastICA', FastICA(n_components=2, random_state=0)), ('KernelPCA', KernelPCA(n_components=2, random_state=0, kernel='rbf')),
               ('SparsePCA', SparsePCA(n_components=2, random_state=0, verbose=True)),
               ('MiniBatchSparsePCA', MiniBatchSparsePCA(n_components=2, verbose=True, batch_size=10, random_state=0)),
               ('DictionaryLearning', DictionaryLearning(n_components=2, verbose=True, random_state=0)),
               ('MiniBatchDictionaryLearning', MiniBatchDictionaryLearning(n_components=2, batch_size=5, random_state=0, alpha=0.1))]

model = namedtuple('models', ['mod_name', 'mod_ins'])

for i in range(len(models_list)):
    mod = model(*models_list[i])
    if mod.mod_name == 'LDA':
        mod.mod_ins.fit(X, y)
        X_new = mod.mod_ins.transform(X)
    else:
        X_new = mod.mod_ins.fit_transform(X)
    plt.figure(i + 2)
    plot_func(mod.mod_name + ' transformed data')
Example #33
                                           index=validation_index)

scatterPlot(X_train_incrementalPCA, y_train, "Incremental PCA")

# In[ ]:

# Sparse PCA
from sklearn.decomposition import SparsePCA

n_components = 100
alpha = 0.0001
random_state = 2020
n_jobs = -1

sparsePCA = SparsePCA(n_components=n_components,
                      alpha=alpha,
                      random_state=random_state,
                      n_jobs=n_jobs)

sparsePCA.fit(X_train.loc[:10000, :])
X_train_sparsePCA = sparsePCA.transform(X_train)
X_train_sparsePCA = pd.DataFrame(data=X_train_sparsePCA, index=train_index)

X_validation_sparsePCA = sparsePCA.transform(X_validation)
X_validation_sparsePCA = pd.DataFrame(data=X_validation_sparsePCA,
                                      index=validation_index)

scatterPlot(X_train_sparsePCA, y_train, "Sparse PCA")

# In[ ]:

# Kernel PCA
Example #34
def Var_Select(orgdata, k, alphaMax=10, alphastep=0.2):
    """
    orgdata-需要信息压缩的数据框
    k-预期最大需要保留的最大变量个数,实际保留数量不能多于这个数值
    alphaMax-SparsePCA算法惩罚项的最大值,一般要到5才会取得比较理想的结果
    alphastep-SparsePCA算法惩罚项递增的步长
    """
    # step 1: if the dataset is very large, subsample it to avoid unnecessary run time
    if orgdata.iloc[:, 1].count() > 5000:
        data = orgdata.sample(5000)
    else:
        data = orgdata
    # step 2: import the required packages and standardize the data
    from sklearn import preprocessing
    import pandas as pd
    import numpy as np
    from sklearn.decomposition import SparsePCA
    #from functools import reduce
    data = preprocessing.scale(data)
    n_components = k
    #pca_n = list()
    # step 3: run SparsePCA, picking a suitable penalty alpha; stop the loop when every original variable has a nonzero weight on exactly one principal component
    for i in np.arange(0.1, alphaMax, alphastep):
        pca_model = SparsePCA(n_components=n_components, alpha=i)
        pca_model.fit(data)
        pca = pd.DataFrame(pca_model.components_).T
        n = data.shape[1] - sum(sum(np.array(pca != 0)))  # count of nonzero coefficients
        if n == 0:
            global best_alpha
            best_alpha = i
            break
    # step 4: refit SparsePCA with the penalty value found above and obtain the sparse principal component scores
    pca_model = SparsePCA(n_components=n_components, alpha=best_alpha)
    pca_model.fit(data)
    pca = pd.DataFrame(pca_model.components_).T
    data = pd.DataFrame(data)
    score = pd.DataFrame(pca_model.fit_transform(data))
    # step 6: compute the 1 - R-squared values between the original variables and the principal components
    r = []
    R_square = []
    for xk in range(data.shape[1]):  # xk indexes the input variables
        for paj in range(n_components):  # paj indexes the principal components
            r.append(
                abs(np.corrcoef(data.iloc[:, xk], score.iloc[:, paj])[0, 1]))
            r_max1 = max(r)
            r.remove(r_max1)
            r.append(-2)
            r_max2 = max(r)
            R_square.append((1 - r_max1**2) / (1 - r_max2**2))

    R_square = abs(
        pd.DataFrame(
            np.array(R_square).reshape((data.shape[1], n_components))))
    var_list = []
    #print(R_square)
    # step 7: within each principal component, pick the original variable with the smallest 1 - R-squared value
    for i in range(n_components):
        vmin = R_square[i].min()
        #print(R_square[i])
        #print(vmin)
        #print(R_square[R_square[i] == min][i])
        var_list.append(R_square[R_square[i] == vmin][i].index)

    news_ids = []
    for id in var_list:
        if id not in news_ids:
            news_ids.append(id)
    print(news_ids)
    data_vc = orgdata.iloc[:, np.array(news_ids).reshape(len(news_ids))]
    return data_vc
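A hypothetical call to Var_Select (random data chosen only to show the interface; note that the helper only sets best_alpha when some alpha in the sweep leaves every variable with a nonzero loading on exactly one component, and raises NameError otherwise):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
frame = pd.DataFrame(rng.normal(size=(1000, 8)),
                     columns=['x%d' % i for i in range(8)])
selected = Var_Select(frame, k=3, alphaMax=10, alphastep=0.2)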
Example #35
	def sparse_pca(self, n_components, alpha):
		# use the n_components argument rather than a hard-coded value
		pca = SparsePCA(n_components=n_components, alpha=alpha)
		self.X = pca.fit_transform(self.X)
		self.df_c = pd.DataFrame(pca.components_.T, index=self.crimes, columns=range(1, n_components + 1))
		return self.df_c
Example #36
    #csv = "c:/iris44.csv"  # wikipedia Iris_flower_data_set
        # 5.1,3.5,1.4,0.2  # ,Iris-setosa ...
    N = 40
    K = 450000
    
    seed = 1
    exec "\n".join( sys.argv[1:] )  # N= ...
    np.random.seed(seed)
    np.set_printoptions( 1, threshold=100, suppress=True )  # .1f
    try:
        A = np.genfromtxt( csv, delimiter="," )
        N, K = A.shape
    except IOError:
        print('error')
        A = np.random.normal( size=(N, K) )  # gen correlated ?

    print(len(A[1]), N, K)
    
    print "A:", A
    #pca = PCA(n_components=4)
    pca = SparsePCA(n_components=None, alpha=1, ridge_alpha=0.01, max_iter=1000, tol=1e-08, method='lars', n_jobs=1, U_init=None, V_init=None, verbose=False, random_state=None)
    scores=pca.fit_transform(A)
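    # NOTE: sklearn's SparsePCA does not provide explained_variance_ratio_, so the next line raises AttributeError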
    pca_variance = pca.explained_variance_ratio_
    coeff = pca.components_
    #A1=pca.inverse_transform(coeff)
    print(pca_variance)
    print("coeff",coeff)
    #score = pca.transform(A)
    print("score",scores)
    #print A1
    
Example #37
fp_mean.append(0)
fn_mean.append(0)
f1_mean.append(0)

n = 1
for FrameRange_ind in range(len(offset_list)):
    for sparsePCA_alpha_ind in sparsePCA_alpha:
        # for sparsePCA_ridge_alpha_ind in sparsePCA_ridge_alpha:
        # compute PCA
        ncomp = 5
        offset = offset_list[FrameRange_ind]
        upto = upto_list[FrameRange_ind]
        # if ~upto:
        #     upto = O.Shapes().shape[0]
        PCA_start = time.time()
        p = SparsePCA(n_components=ncomp, alpha=sparsePCA_alpha_ind, ridge_alpha=0.01)
        PCA_end = time.time()
        print("The " + str(n) + " PCA time: " + str(PCA_end-PCA_start))
        Projection_start = time.time()
        scorePCA = p.fit_transform(O.Shapes()[offset:upto, :].T).T
        Projection_end = time.time()
        print("The " + str(n) + " Projection time: " + str(Projection_end-Projection_start))
        # explained_variance_ratio = p.explained_variance_ratio_
        plt.figure(1)
        plt.plot(p.components_.T)
        plt.legend(range(5))
        plt.savefig("princomp/" + str(offset) + "to" + str(upto) + "_alpha" + str(sparsePCA_alpha_ind) + ".png", bbox_inches='tight')
        plt.clf()

        plt.figure(2)
        plt.scatter(scorePCA[0, :10000], scorePCA[1, :10000], s=4)
Example #38
class SPCA(object):
    def __init__(self,
                 n_components=None,
                 alpha=1,
                 ridge_alpha=0.01,
                 max_iter=1000,
                 tol=1e-8,
                 method='lars',
                 n_jobs=None,
                 U_init=None,
                 V_init=None,
                 verbose=False,
                 random_state=None,
                 normalize_components='deprecated'):
        """
        :param n_components:
        :param alpha:
        :param ridge_alpha:
        :param max_iter:
        :param tol:
        :param method:
        :param n_jobs:
        :param U_init:
        :param V_init:
        :param verbose:
        :param random_state:
        :param normalize_components:
        """
        self.model = SparsePCA(n_components=n_components,
                               alpha=alpha,
                               ridge_alpha=ridge_alpha,
                               max_iter=max_iter,
                               tol=tol,
                               method=method,
                               n_jobs=n_jobs,
                               U_init=U_init,
                               V_init=V_init,
                               verbose=verbose,
                               random_state=random_state,
                               normalize_components=normalize_components)

    def fit(self, x, y):
        self.model.fit(X=x, y=y)

    def transform(self, x):
        return self.model.transform(X=x)

    def fit_transform(self, x, y=None):
        return self.model.fit_transform(X=x, y=y)

    def get_params(self):
        return self.model.get_params(deep=True)

    def set_params(self, **params):
        return self.model.set_params(**params)

    def get_attributes(self):
        components = self.model.components_
        error = self.model.error_
        n_iter = self.model.n_iter_
        mean = self.model.mean_
        return components, error, n_iter, mean
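A minimal usage sketch for the SPCA wrapper above (toy data; it assumes a scikit-learn version that still accepts the normalize_components keyword):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(50, 20)

model = SPCA(n_components=5, alpha=1.0)
U = model.fit_transform(X)                       # shape (50, 5)
components, error, n_iter, mean = model.get_attributes()
# components: (5, 20) loading matrix; mean: per-feature mean removed before fitting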
Example #39
def apply_band_selection(technique, dataset, predictions, mode, n_components,
                         df_column_entry_dict):
    if df_column_entry_dict is None:
        df_column_entry_dict = {
        }  # couldn't care less, this is a lazy way to make all accesses work

    print("Dataset current shape: " + str(dataset.shape))

    print_memory_metrics("before applying band selection method " + technique,
                         df_column_entry_dict)

    from DeepHyperX.batch import PARAMETER_JSON
    parameterFile = open(PARAMETER_JSON, "r")
    import json
    data = json.load(parameterFile)
    parameterFile.close()

    if technique in ["IncrementalPCA"]:  # requires special method
        dataset, _ = applyIncrementalPCA(dataset, n_components)

    elif technique in data["image_compression"]["extraction"]["techniques"]:

        extraction_object = None
        if technique == "PCA":
            from sklearn.decomposition import PCA
            """ HybridSN: Exploring 3D-2D CNN Feature Hierarchy for Hyperspectral Image Classification
            Source code used: https://github.com/gokriznastic/HybridSN/blob/master/Hybrid-Spectral-Net.ipynb
            Paper: https://arxiv.org/abs/1902.06701
            Good parameters: 30 components for Indian Pines, 15 for Salinas and Pavia University
            """
            extraction_object = PCA(n_components=n_components, whiten=True)
        elif technique == "KernelPCA":
            from sklearn.decomposition import KernelPCA
            extraction_object = KernelPCA(kernel="rbf",
                                          n_components=n_components,
                                          gamma=None,
                                          fit_inverse_transform=True,
                                          n_jobs=1)
        elif technique == "SparsePCA":
            """Sparse PCA uses the links between the ACP and the SVD to extract the main components by solving a lower-order matrix approximation problem."""
            from sklearn.decomposition import SparsePCA
            extraction_object = SparsePCA(n_components=n_components,
                                          alpha=0.0001,
                                          n_jobs=-1)
        elif technique == "LDA":  # only supervised is supported, y is required
            if mode != "supervised":
                print(
                    "warning: mode other than supervised detected for lda, setting it to supervised...\n"
                )
                mode = "supervised"
            # maximally n_classes - 1 columns, https://stackoverflow.com/questions/26963454/lda-ignoring-n-components
            from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
            extraction_object = LinearDiscriminantAnalysis(
                n_components=n_components)
        elif technique == "SVD":
            from sklearn.decomposition import TruncatedSVD
            extraction_object = TruncatedSVD(n_components=n_components,
                                             algorithm='randomized',
                                             n_iter=5)
        elif technique == "GRP":
            from sklearn.random_projection import GaussianRandomProjection
            extraction_object = GaussianRandomProjection(
                n_components=n_components, eps=0.5)
        elif technique == "SRP":
            from sklearn.random_projection import SparseRandomProjection
            extraction_object = SparseRandomProjection(
                n_components=n_components,
                density='auto',
                eps=0.5,
                dense_output=False)
        elif technique == "MDS":
            """O(n^3), uses lots of memory for distance matrix (doesn't fit in 48GB), doesn't fit in GPU memory either, so basically unusable"""
            from sklearn.manifold import MDS
            extraction_object = MDS(n_components=n_components,
                                    n_init=12,
                                    max_iter=200,
                                    metric=True,
                                    n_jobs=16)
        elif technique == "MiniBatch":
            """takes too long"""
            from sklearn.decomposition import MiniBatchDictionaryLearning
            extraction_object = MiniBatchDictionaryLearning(
                n_components=n_components, batch_size=200, alpha=1, n_iter=1)
        elif technique == "LLE":
            # modified LLE requires n_neighbors >= n_components
            """execution takes 20 minutes or so, but it does work, just takes a long time"""
            from sklearn.manifold import LocallyLinearEmbedding
            extraction_object = LocallyLinearEmbedding(
                n_components=n_components,
                n_neighbors=100,
                method='modified',
                n_jobs=4)
        elif technique == "ICA":
            from sklearn.decomposition import FastICA
            extraction_object = FastICA(n_components=n_components,
                                        algorithm='parallel',
                                        whiten=True,
                                        max_iter=100)
        elif technique == "FactorAnalysis":
            from sklearn.decomposition import FactorAnalysis
            extraction_object = FactorAnalysis(n_components=n_components)  #75
        elif technique == "ISOMAP":
            from sklearn import manifold
            extraction_object = manifold.Isomap(n_neighbors=5,
                                                n_components=n_components,
                                                n_jobs=-1)
        elif technique == "t-SNE":
            # like PCA, but non-linear (pca is linear)
            from sklearn.manifold import TSNE
            extraction_object = TSNE(n_components=n_components,
                                     learning_rate=300,
                                     perplexity=30,
                                     early_exaggeration=12,
                                     init='random')
        elif technique == "UMAP":
            # install umap-learn for this to work
            import umap
            extraction_object = umap.UMAP(n_neighbors=50,
                                          min_dist=0.3,
                                          n_components=n_components)
        elif technique == "NMF":
            # https://www.kaggle.com/remidi/dimensionality-reduction-techniques
            from sklearn.decomposition import NMF
            extraction_object = NMF(n_components=n_components,
                                    init='nndsvdar',
                                    random_state=420)
        elif technique == "F*G":
            # super fast and nice
            from sklearn.cluster import FeatureAgglomeration
            extraction_object = FeatureAgglomeration(n_clusters=n_components,
                                                     linkage='ward')
        else:
            raise ValueError("Unknown feature extraction technique: " +
                             technique)

        start_mem_measurement()
        start = time.time()

        dataset, _ = applyFeatureExtraction(
            dataset,
            predictions,
            extraction_object,
            mode,
            merged=(len(dataset.shape) == 4 and len(predictions.shape) == 3))

        time_elapse = time.time() - start

        event = 'applying band selection method (EXTRACTION) ' + technique
        formatted_time = str(timedelta(seconds=time_elapse))
        df_column_entry_dict['Time measurement at ' + event +
                             ' [s]'] = time_elapse

        print("\n" + event + " took " + formatted_time + " seconds\n")

        event = "after applying band selection method " + technique
        stop_mem_measurement(event, df_column_entry_dict)
        print_memory_metrics(event, df_column_entry_dict)

    elif technique in data["image_compression"]["selection"]["techniques"]:

        selection_object = None
        if technique == "RandomForest":
            # Random forests are an ensemble learning method for classification and regression
            # that builds many decision trees at training time and outputs the mode of the
            # classes (classification) or the mean prediction (regression) of the individual
            # trees; they correct for decision trees' habit of overfitting to their training set.
            # https://en.wikipedia.org/wiki/Random_forest
            from sklearn.ensemble import RandomForestClassifier
            selection_object = RandomForestClassifier()
        elif technique == "LogisticRegression":
            from sklearn.linear_model import LogisticRegression
            selection_object = LogisticRegression()
        elif technique == "LinearRegression":
            from sklearn.linear_model import LinearRegression
            selection_object = LinearRegression()
        elif technique == "LightGBM":
            from lightgbm import LGBMClassifier
            selection_object = LGBMClassifier()
        else:
            raise ValueError("Unknown feature selection technique: " +
                             technique)

        start_mem_measurement()
        start = time.time()

        dataset, _ = applyFeatureSelection(
            dataset,
            predictions,
            selection_object,
            n_components,
            mode,
            merged=(len(dataset.shape) == 4 and len(predictions.shape) == 3))

        time_elapse = time.time() - start

        event = 'applying band selection method (SELECTION) ' + technique
        formatted_time = str(timedelta(seconds=time_elapse))
        df_column_entry_dict['Time measurement at ' + event +
                             ' [s]'] = time_elapse

        print("\n" + event + " took " + formatted_time + " seconds\n")

        event = "after applying band selection method " + technique
        stop_mem_measurement(event, df_column_entry_dict)
        print_memory_metrics(event, df_column_entry_dict)

    print("Dataset new shape: " + str(dataset.shape))

    return dataset
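A heavily hedged call sketch for apply_band_selection (it assumes the surrounding DeepHyperX helpers, parameter JSON and data arrays are available; the argument values are illustrative only):

compressed = apply_band_selection(technique="SparsePCA",
                                  dataset=dataset,            # hyperspectral cube
                                  predictions=predictions,    # ground-truth map
                                  mode="unsupervised",
                                  n_components=15,
                                  df_column_entry_dict={})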
Example #40
#Sparse PCA analysis
################################
#Sparse Principal Components Analysis (SparsePCA)
#SparsePCA
"""
Finds the set of sparse components that can optimally reconstruct the data. 
The amount of sparseness is controllable by the coefficient of the L1 penalty, 
given by the parameter alpha.
"""
from sklearn.decomposition import SparsePCA
#SparsePCA(n_components=None, alpha=1, ridge_alpha=0.01, max_iter=1000, tol=1e-08,
#method='lars', n_jobs=1, U_init=None, V_init=None,
#verbose=False, random_state=None)
#method : {‘lars’, ‘cd’}
#alpha: higher value--sparser components
spca = SparsePCA(method='lars')
SPCA_OUTPUT = spca.fit(X_all_his_center)
X_spca = spca.fit_transform(X_all_his_center)
np.savetxt("D:/lly/2017MM/PHASE2/final_totoal/SPCA_MM_PCs.csv",
           SPCA_OUTPUT.components_,
           delimiter=",")

#2d-visualization-SPCA-data projection in higher dimensional space
fig = plt.figure()
plt.plot(X_spca[reds, 0], X_spca[reds, 1], "ro", markersize=10)
plt.plot(X_spca[blues, 0], X_spca[blues, 1], "b^", alpha=0.5)
plt.plot(X_spca[greens, 0], X_spca[greens, 1], "g+")
plt.legend('LWN')
plt.title("newData under two PCs--SPCA")
plt.xlabel("$1^{st}$ PC")
plt.ylabel("$2^{nd}$ PC")
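Since the comment above notes that a higher alpha yields sparser components, a small stand-alone sketch (random data, not the original X_all_his_center) can make that concrete:

import numpy as np
from sklearn.decomposition import SparsePCA

rng = np.random.RandomState(0)
X = rng.randn(100, 30)

for alpha in (0.1, 1, 5):
    spca = SparsePCA(n_components=5, alpha=alpha, random_state=0)
    spca.fit(X)
    # the fraction of exactly-zero loadings grows with the L1 penalty
    print(alpha, np.mean(spca.components_ == 0))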
Example #41
print(pca.explained_variance_ratio_)

fig = plt.figure()
fig.suptitle('PCA', fontsize=32)
plt.plot(range(len(pca.explained_variance_ratio_)),
         pca.explained_variance_ratio_)
plt.show()

show_figure(fdata, labels, ulabs, 'PCA')

# Sparse PCA
print('Sparse PCA')
from sklearn.decomposition import SparsePCA

spca = SparsePCA(n_components=3)
fdata = spca.fit_transform(authors)
show_figure(fdata, labels, ulabs, 'Sparse PCA')

# ISOMAP

print('ISOMAP')
from sklearn.manifold import Isomap
iso = Isomap(n_components=3, n_neighbors=7)
fdata = iso.fit_transform(authors)

show_figure(fdata, labels, ulabs, 'ISOMAP')

# LLE
print('LLE')
from sklearn.manifold import LocallyLinearEmbedding
Example #42
File: spca.py Project: mikss/sdp-ex
import numpy as np
N = 500
P = 10
MU = [0] * P
T = 1  # spike level
K = 2  # sparsity level
V = list(range(1,K+1)) + [0]*(P-K)
V = V / np.linalg.norm(V)
SIG = np.identity(P) + T * np.matrix(V).transpose() * np.matrix(V)
X = np.matrix(np.random.multivariate_normal(MU,SIG,N))

#####

# using scikit-learn method for Sparse PCA (like an l1-regularized dictionary learning problem)
from sklearn.decomposition import SparsePCA
spca = SparsePCA(n_components=1, alpha=5)
spca.fit(X)

from sklearn.decomposition import PCA
pca = PCA(n_components=1)
pca.fit(X)

print('Classical 1st principal component:', pca.components_)
print('Sparse 1st principal component:', spca.components_)

#####

# TODO: SDP implementation a la El Ghaoui, Bach, D'Aspremont
import cvxopt
# TWO CONSTRAINTS
# trace = 1 (multiply with identity)
Example #43
    if regen_dicts:
        train_trans, test_trans = regression_translation(model=model)
        pickle.dump((train_trans, test_trans), open(filename, 'w+'))
    else:
        train_trans, test_trans = pickle.load(open(filename, 'r'))

    print "training translation with model:{} scored:{}".format(
        model, translation_quality(train_trans))
    print "testing translation with model:{} scored:{}".format(
        model, translation_quality(test_trans))

try_search = False
if try_search:
    dims = [2, 5, 10, 20, 30, None]
    for dim in dims:
        pre_transforms = [
            TruncatedSVD(n_components=dim),
            PCA(n_components=dim),
            # KernelPCA(n_components=dim, kernel='rbf'), #TODO fix memory error / try on larger machine
            SparsePCA(n_components=dim)
        ]  # ,
        # DictionaryLearning(n_components=dim)]

        for pre_transform in pre_transforms:
            embedding_translation = nn_embedding_translate(
                pre_transform=pre_transform)
            print "Translation with pt:{} scored:{}".format(
                pre_transform, translation_quality(embedding_translation))

    print translation_quality(en_2_es)
Example #44
def spca(X, n_comp, random_state):
    spca = SparsePCA(n_components=n_comp, random_state=random_state)
    spca.fit(X)

    return spca
Example #45
        test[c] = lbl.transform(list(test[c].values))

n_comp = 12

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
tsvd_results_test = tsvd.transform(test)

# PCA
# pca = PCA(n_components=n_comp, random_state=420)
# pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
# pca2_results_test = pca.transform(test)

#sparse PCA
spca = SparsePCA(n_components=n_comp, random_state=420)
spca2_results_train = spca.fit_transform(train.drop(["y"], axis=1))
spca2_results_test = spca.transform(test)

#Kernel PCA
kpca = KernelPCA(n_components=n_comp, random_state=420)
kpca2_results_train = kpca.fit_transform(train.drop(["y"], axis=1))
kpca2_results_test = kpca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
Example #46
def transform(xTrain, yTrain, xTest):
    pca = SparsePCA(n_components=2)
    newXTrain = pca.fit_transform(xTrain, yTrain)
    newXTest = pca.transform(xTest)
    return newXTrain, newXTest
Example #47
from sklearn.decomposition import SparsePCA
from sklearn.preprocessing import StandardScaler

#convert RData to pd DataFrame
readin = pyreadr.read_r('C:/Users/TW/Downloads/west.RData')
westdf = readin["west"]
chapter = westdf[['chapter']]

#propossesing the data
westdf = westdf.drop(['chapter'], axis=1)  #delete 'chapter' column   408*302
x = westdf.loc[:, :].values
x = StandardScaler(with_std=False).fit_transform(x)  #centerlize the data

#SparsePCA transform
transformer = SparsePCA(n_components=3,\
                        alpha=0.1,\
                        normalize_components=True,\
                        random_state=0)
x_transformed = transformer.fit_transform(x)

# for data analysis
x_transformed.shape
transformer.alpha
eigenvectors = transformer.components_
transformer.error_
transformer.get_params(deep=True)
np.mean(transformer.components_ == 0)
westspca = pd.DataFrame(data=eigenvectors, columns=westdf.columns)
Spca1 = westspca.sort_values(by=[0], axis=1)
Spca2 = westspca.sort_values(by=[1], axis=1)
Spca3 = westspca.sort_values(by=[2], axis=1)
# Spca4 = westspca.sort_values(by=[3], axis=1)  # invalid: only 3 components were fitted (rows 0-2)
Example #48
 'phate':
 lambda args: PHATE(n_components=args.dims, n_jobs=args.njobs)
 if PHATE_AVAILABLE else _embedding_error(),
 'pmf':
 lambda args: NimfaWrapper(nimfa.Pmf, args.dims),
 'psmf':
 lambda args: NimfaWrapper(nimfa.Psmf, args.dims),
 'saucie':
 lambda args: SaucieWrapper(args.dims)
 if SAUCIE_AVAILABLE else _embedding_error(),
 'scscope':
 lambda args: ScScope(args.dims) if SCSCOPE_AVAILABLE else _embedding_error,
 'sepnmf':
 lambda args: NimfaWrapper(nimfa.SepNMF, args.dims),
 'spca':
 lambda args: SparsePCA(
     n_components=args.dims, n_jobs=args.njobs, normalize_components=True),
 'spca-batch':
 lambda args: MiniBatchSparsePCA(
     n_components=args.dims, n_jobs=args.njobs, normalize_components=True),
 'spectral':
 lambda args: SpectralEmbedding(n_components=args.dims, n_jobs=args.njobs),
 'snmf':
 lambda args: NimfaWrapper(nimfa.Snmf, args.dims),
 'srp':
 lambda args: SparseRandomProjection(n_components=args.dims),
 'tga':
 lambda args: TGA(n_components=args.dims)
 if TGA_AVAILABLE else _embedding_error(),
 'tsvd':
 lambda args: TruncatedSVD(n_components=args.dims),
 'tsne':
Example #49
def test_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    if sys.platform == 'win32':  # fake parallelism for win32
        import sklearn.externals.joblib.parallel as joblib_par
        _mp = joblib_par.multiprocessing
        joblib_par.multiprocessing = None
        try:
            spca = SparsePCA(n_components=3, n_jobs=2, random_state=0,
                             alpha=alpha).fit(Y)
            U2 = spca.transform(Y)
        finally:
            joblib_par.multiprocessing = _mp
    else:  # we can efficiently use parallelism
        spca = SparsePCA(n_components=3, n_jobs=2, method='lars', alpha=alpha,
                         random_state=0).fit(Y)
        U2 = spca.transform(Y)
    assert_true(not np.all(spca_lars.components_ == 0))
    assert_array_almost_equal(U1, U2)
    # Test that CD gives similar results
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=0,
                           alpha=alpha)
    spca_lasso.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
Example #50
def textSimilarity():
    NeighborDirectory = GEOTEXT_HOME
    # matplotlib.use('Agg')
    DATA_FOLDER = userTextDirectory
    # DATA_FOLDER = "/GEOTEXT_HOME/af/Downloads/review_polarity/txt_sentoken"
    K_FOLD = 10
    data_target = load_files(DATA_FOLDER, encoding=encoding)
    filenames = data_target.filenames
    DO_PCA = True
    DO_SPARSEPCA = False
    Reduction_D = 100
    DO_SVD = False
    categories = data_target.target_names
    DO_NMF = False
    
    def size_mb(docs):
        return sum(len(s.encode(encoding)) for s in docs) / 1e6
    
    data_size_mb = size_mb(data_target.data)
    
    
    print("%d documents - %0.3fMB (all data set)" % (
        len(data_target.data), data_size_mb))
    
    print("%d categories" % len(categories))
    print()
    
    # split a training set and a test set
    target = data_target.target
    
    
    
    print("Extracting features from all the dataset using a sparse vectorizer")
    t0 = 0
    vectorizer = TfidfVectorizer(use_idf=True, norm='l2', binary=False, sublinear_tf=True, min_df=2, max_df=0.2, ngram_range=(1, 1), stop_words='english')
    
    # vectorizer = CountVectorizer(min_df=2, max_df=1.0, ngram_range=(1, 4))
    # the output of the fit_transform (x_train) is a sparse csc matrix.
    data = vectorizer.fit_transform(data_target.data)
    print(data.dtype)
    data = csr_matrix(data, dtype=float32)
    print(data.dtype)
    duration = 1
    print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
    print("n_samples: %d, n_features: %d" % data.shape)
    print()
    
    
    
    if DO_PCA:
        print("dimension reduction pca with d=%d" % Reduction_D)
        pca = PCA(n_components=Reduction_D, copy=True, whiten=False)
        print(type(data))
        data = pca.fit_transform(data.todense())
    if DO_SPARSEPCA:
        print("dimension reduction sparsepca with d=%d" % Reduction_D)
        spca = SparsePCA(Reduction_D)
        data = spca.fit_transform(data.toarray())
    if DO_SVD:
        print("dimension reduction svd with d=%d" % Reduction_D)
        svd = TruncatedSVD(n_components=Reduction_D, algorithm="randomized", n_iterations=5, random_state=None, tol=0)
        data = svd.fit_transform(data)
    if DO_NMF:
        print("dimension reduction nmf with d=%d" % Reduction_D)
        nmf = NMF(n_components=Reduction_D)
        data = nmf.fit_transform(data)
    
    DO_CHI = False
    if DO_CHI:
        print("Extracting best features by a chi-squared test")
        ch2NumFeatures = 1000 
        ch2 = SelectKBest(chi2, k=ch2NumFeatures)
        # print vectorizer.get_stop_words()
        data = ch2.fit_transform(data, target)
        # print data
    
    
    KNN = 10
    nn = NearestNeighbors(n_neighbors=KNN + 1, algorithm='ball_tree').fit(data)
    # query and data are the same so every node is counted as its most similar here
    distances, indices = nn.kneighbors(data)
    with codecs.open(path.join(NeighborDirectory, 'neighbors.txt'), 'w', encoding) as outf:
        nodeIndex = -1
        nodeNeighbors = []
        for neighbors in indices:
            nodeIndex += 1
            outf.write(path.basename(filenames[nodeIndex]) + ' ')
            for neighbor in neighbors:
                if neighbor == nodeIndex:
                    continue
                else:
                    outf.write(path.basename(filenames[neighbor]) + ' ')
            outf.write('\n')
Example #51
def _cluster_analysis(feats_data, save_name, is_color_time=False):
    #%% ##### Clustering analysis
    df_n = feats_data.dropna()
    
    df = df_n[set_feats].copy()
    index_data = df_n[index_cols].reset_index()
    
    X = df.values.copy()
    #[x_min==x_max]
    x_min, x_max = df.min(), df.max()
    df = (df - x_min)/(x_max - x_min)
    
    X = df.values
    #%% #### labels and indexes vectors
    nz = int(np.ceil(np.log10(index_data['time_group']+ 0.001).max()))
    time_g_str = [('%1.1f' % x).zfill(nz+2) for x in index_data['time_group'].values]
    cohort_str = [str(int(x)) for x in index_data['cohort_n']]
    
    labels = ['C{}_T{}'.format(*x) for x in zip(cohort_str, time_g_str)]
    label_order = sorted(list(set(labels)))
    
    uC = sorted(list(set(cohort_str)))
    uT = sorted(list(set(time_g_str)))
    filled_markers = ('o', 's', 'v', '^', '<', '>', '8', 'p', '*', 'h', 'H', 'D', 'd', 'P', 'X')
    
    if is_color_time:
        cols = sns.color_palette("RdYlGn", len(uT))
        
        col_dict_u = {k : v for k,v in zip(time_g_str, cols)}
        col_dict = {ll : col_dict_u[tt] for ll, tt in zip(labels, time_g_str)}
        
        mks_dict = {x : filled_markers[ii] for ii, x in enumerate(uC)}
        mks = [mks_dict[x[1]] for x in label_order]
    else:
        
        cols = sns.color_palette("colorblind", len(uC))
        col_dict_u = {k : v for k,v in zip(uC, cols)}
        col_dict = {ll : col_dict_u[tt] for ll, tt in zip(labels, cohort_str)}
        
        mks_dict = {x : filled_markers[ii] for ii, x in enumerate(uT)}
        mks = [mks_dict[x.partition('_T')[-1]] for x in label_order]
    
    
    #%%
    tsne = TSNE(n_components=2, 
                    #perplexity = 21,
                    init='pca',
                    verbose=1, 
                    n_iter=10000
                    )
    X_tsne = tsne.fit_transform(X)
    
    #%%
    pca_s = SparsePCA()
    X_pca_s = pca_s.fit_transform(X)
    
    pca = PCA()
    X_pca = pca.fit_transform(X)
    #%%
    
            
    dat = {'t-SNE':X_tsne, 'PCA':X_pca, 'PCA_Sparse':X_pca_s}
    
    with PdfPages(save_name) as pdf_pages:
        
        for k, Xp in dat.items():
            _plot_clusters(Xp, labels, label_order, col_dict, mks)
            plt.title(k)
            pdf_pages.savefig()
            plt.close()
            #%%
    return dat
Example #52
0
def niftidecomp_workflow(
    decompaxis,
    datafile,
    outputroot,
    datamaskname=None,
    decomptype="pca",
    pcacomponents=0.5,
    icacomponents=None,
    varnorm=True,
    demean=True,
    sigma=0.0,
):

    print(f"Will perform {decomptype} analysis along the {decompaxis} axis")

    if decompaxis == "temporal":
        decompaxisnum = 1
        transposeifspatial = lambda x: x  # identity: keep (voxels x timepoints) so components are timecourses
    else:
        decompaxisnum = 0
        transposeifspatial = np.transpose

    # save the command line
    tide_io.writevec([" ".join(sys.argv)], outputroot + "_commandline.txt")

    # read in data
    print("reading in data arrays")
    (
        datafile_img,
        datafile_data,
        datafile_hdr,
        datafiledims,
        datafilesizes,
    ) = tide_io.readfromnifti(datafile)

    if datamaskname is not None:
        (
            datamask_img,
            datamask_data,
            datamask_hdr,
            datamaskdims,
            datamasksizes,
        ) = tide_io.readfromnifti(datamaskname)

    xsize, ysize, numslices, timepoints = tide_io.parseniftidims(datafiledims)
    xdim, ydim, slicethickness, tr = tide_io.parseniftisizes(datafilesizes)

    # check dimensions
    if datamaskname is not None:
        print("checking mask dimensions")
        if not tide_io.checkspacedimmatch(datafiledims, datamaskdims):
            print("input mask spatial dimensions do not match image")
            exit()
        if not (tide_io.checktimematch(datafiledims, datamaskdims)
                or datamaskdims[4] == 1):
            print("input mask time dimension does not match image")
            exit()

    # smooth the data
    if sigma > 0.0:
        print("smoothing data")
        for i in range(timepoints):
            datafile_data[:, :, :,
                          i] = tide_filt.ssmooth(xdim, ydim, slicethickness,
                                                 sigma, datafile_data[:, :, :,
                                                                      i])

    # allocating arrays
    print("reshaping arrays")
    numspatiallocs = int(xsize) * int(ysize) * int(numslices)
    rs_datafile = datafile_data.reshape((numspatiallocs, timepoints))

    print("masking arrays")
    if datamaskname is not None:
        if datamaskdims[4] == 1:
            proclocs = np.where(datamask_data.reshape(numspatiallocs) > 0.5)
        else:
            proclocs = np.where(
                np.mean(datamask_data.reshape((numspatiallocs, timepoints)),
                        axis=1) > 0.5)
            rs_mask = datamask_data.reshape(
                (numspatiallocs, timepoints))[proclocs, :]
            rs_mask = np.where(rs_mask > 0.5, 1.0, 0.0)[0]
    else:
        datamaskdims = [1, xsize, ysize, numslices, 1]
        themaxes = np.max(rs_datafile, axis=1)
        themins = np.min(rs_datafile, axis=1)
        thediffs = (themaxes - themins).reshape(numspatiallocs)
        proclocs = np.where(thediffs > 0.0)
    procdata = rs_datafile[proclocs, :][0]
    print(rs_datafile.shape, procdata.shape)

    # normalize the individual images
    if demean:
        print("demeaning array")
        themean = np.mean(procdata, axis=decompaxisnum)
        print("shape of mean", themean.shape)
        for i in range(procdata.shape[1 - decompaxisnum]):
            if decompaxisnum == 1:
                procdata[i, :] -= themean[i]
            else:
                procdata[:, i] -= themean[i]
    else:
        themean = np.ones(procdata.shape[1 - decompaxisnum])

    if varnorm:
        print("variance normalizing array")
        thevar = np.var(procdata, axis=decompaxisnum)
        print("shape of var", thevar.shape)
        for i in range(procdata.shape[1 - decompaxisnum]):
            if decompaxisnum == 1:
                procdata[i, :] /= thevar[i]
            else:
                procdata[:, i] /= thevar[i]
        procdata = np.nan_to_num(procdata)
    else:
        thevar = np.ones(procdata.shape[1 - decompaxisnum])

    # applying mask
    if datamaskdims[4] > 1:
        procdata *= rs_mask

    # now perform the decomposition
    if decomptype == "ica":
        print("performing ica decomposition")
        if icacomponents is None:
            print("will return all significant components")
        else:
            print("will return", icacomponents, "components")
        thefit = FastICA(n_components=icacomponents).fit(
            transposeifspatial(procdata))  # Reconstruct signals
        if icacomponents is None:
            thecomponents = transposeifspatial(thefit.components_[:])
            print(thecomponents.shape[1], "components found")
        else:
            thecomponents = transposeifspatial(
                thefit.components_[0:icacomponents])
            print("returning first", thecomponents.shape[1],
                  "components found")
    else:
        print("performing pca decomposition")
        if pcacomponents < 1.0:
            print(
                "will return the components accounting for",
                pcacomponents * 100.0,
                "% of the variance",
            )
        else:
            print("will return", pcacomponents, "components")
        if decomptype == "pca":
            thepca = PCA(n_components=pcacomponents)
        else:
            thepca = SparsePCA(n_components=pcacomponents)
        thefit = thepca.fit(transposeifspatial(procdata))
        thetransform = thepca.transform(transposeifspatial(procdata))
        theinvtrans = transposeifspatial(
            thepca.inverse_transform(thetransform))
        if pcacomponents < 1.0:
            thecomponents = transposeifspatial(thefit.components_[:])
            print("returning", thecomponents.shape[1], "components")
        else:
            thecomponents = transposeifspatial(
                thefit.components_[0:pcacomponents])

        # save the eigenvalues
        print("variance explained by component:",
              100.0 * thefit.explained_variance_ratio_)
        tide_io.writenpvecs(
            100.0 * thefit.explained_variance_ratio_,
            outputroot + "_explained_variance_pct.txt",
        )

        if decompaxis == "temporal":
            # save the components
            print("writing component timecourses")
            tide_io.writenpvecs(thecomponents, outputroot + "_components.txt")

            # save the singular values
            print("writing singular values")
            thesingvals = thefit.singular_values_  # available on PCA fits
            tide_io.writenpvecs(np.transpose(thesingvals),
                                outputroot + "_singvals.txt")

            # save the coefficients
            print("writing out the coefficients")
            coefficients = thetransform
            print("coefficients shape:", coefficients.shape)
            theheader = datafile_hdr
            theheader["dim"][4] = coefficients.shape[1]
            tempout = np.zeros((numspatiallocs, coefficients.shape[1]),
                               dtype="float")
            tempout[proclocs, :] = coefficients[:, :]
            tide_io.savetonifti(
                tempout.reshape(
                    (xsize, ysize, numslices, coefficients.shape[1])),
                datafile_hdr,
                outputroot + "_coefficients",
            )
            # unnormalize the dimensionality reduced data
            for i in range(numspatiallocs):
                theinvtrans[i, :] = thevar[i] * theinvtrans[i, :] + themean[i]

        else:
            # save the component images
            print("writing component images")
            theheader = datafile_hdr
            theheader["dim"][4] = thecomponents.shape[1]
            tempout = np.zeros((numspatiallocs, thecomponents.shape[1]),
                               dtype="float")
            tempout[proclocs, :] = thecomponents[:, :]
            tide_io.savetonifti(
                tempout.reshape(
                    (xsize, ysize, numslices, thecomponents.shape[1])),
                datafile_hdr,
                outputroot + "_components",
            )

            # save the coefficients
            print("writing out the coefficients")
            coefficients = np.transpose(thetransform)
            tide_io.writenpvecs(coefficients, outputroot + "_coefficients.txt")

            # unnormalize the dimensionality reduced data
            for i in range(timepoints):
                theinvtrans[:, i] = thevar[i] * theinvtrans[:, i] + themean[i]

        print("writing fit data")
        theheader = datafile_hdr
        theheader["dim"][4] = theinvtrans.shape[1]
        tempout = np.zeros((numspatiallocs, theinvtrans.shape[1]),
                           dtype="float")
        tempout[proclocs, :] = theinvtrans[:, :]
        tide_io.savetonifti(
            tempout.reshape((xsize, ysize, numslices, theinvtrans.shape[1])),
            datafile_hdr,
            outputroot + "_fit",
        )
Example #53
0
cnt=0
feature=[[0 for i in range(0,n_feat)] for j in range(0,120542)] #80362
for line in fin:
    a=line.split(" ")
    for i in range(2,n_feat):
        feature[cnt][i-2]=float(a[i].split(":")[1])
    cnt+=1
print(cnt)
#print feature[cnt-1]

X=np.array(feature)
'''
pca=PCA(n_components=n_feat)
pca_result=pca.fit_transform(X)
'''
pca=SparsePCA(n_components=n_feat,alpha=0.6,n_jobs=2,max_iter=15)
pca_result=pca.fit_transform(X)

#print pca_result[0]
cnt=0
fin = open("data/feature/train_gh_97a",'r')

for line in fin:
    a=line.split(" ")
    PCA_d=50
    for i in range(0,PCA_d):
        a[i+2]=str(i)+":"+str(feature[cnt][i])
    ll=" ".join(a[0:PCA_d+2])
    fo.write(ll+"\n")
    cnt+=1
fo.close()
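If the intention was to write out the SparsePCA-reduced features rather than the raw ones, the write-back loop would use pca_result instead of feature. A sketch under that assumption (it reuses pca_result from the snippet above; the output path and the fo handle are made up):

PCA_d = 50
cnt = 0
with open("data/feature/train_gh_97a", 'r') as fin, \
     open("data/feature/train_gh_97a_pca", 'w') as fo:   # hypothetical output path
    for line in fin:
        a = line.split(" ")
        for i in range(0, PCA_d):
            a[i + 2] = str(i) + ":" + str(pca_result[cnt][i])  # reduced features, not the raw ones
        fo.write(" ".join(a[0:PCA_d + 2]) + "\n")
        cnt += 1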
Example #54
0
		for f in cats:
			count += 1
			if count > n:
				break
			try:
				cat = io.imread("sparse-cats/"+f,as_grey=True).flatten()
				cat.shape = (40000,1)
				images = np.append(images, cat, axis=1)
			except:
				count -= 1
				continue
		print("loaded cats...")

		tic = time.clock()
		print("starting learning...")
		pca = SparsePCA(n_components=n,max_iter=1000)
		x = pca.fit_transform(images,subject)
		print("learning done...")
		toc = time.clock()
		print(x)

		out = np.zeros(40000)
		print("starting transform...")
		for i in range(40000):
			for j in range(n):
				#out[i] += (x[i,j])
				out[i] += (images[i,j] * x[i,j])

		out.shape = (200,200)
		print(out)
		name = re.match("people/([a-z]*)_small.jpg",filename).group(1)
Example #55
0
 def fit(self, dif_df):
     factorization = SparsePCA(n_components=self.n_components, alpha=0.03)
     X = dif_df.values[1:]
     self.ticker_symbols_used = dif_df.columns.values
     factorization.fit(X)
     self.factorization = factorization
Example #56
0
class SPCA(object):
    """
    Wrapper for sklearn package.  Performs sparse PCA

    SPCA has 5 methods:
       - fit(waveforms)
       update class instance with the sparse PCA fit

       - fit_transform()
       do what fit() does, but additionally return the projection onto the sparse PCA space

       - inverse_transform(A)
       inverses the decomposition, returns waveforms for an input A, using Z

       - get_basis()
       returns the basis vectors Z^\dagger

       - get_params()
       returns metadata used for fits.
    """
    def __init__(self, num_components=10,
                 catalog_name='unknown',
                 alpha = 0.1,
                 ridge_alpha = 0.01,
                 max_iter = 2000,
                 tol = 1e-9,
                 n_jobs = 1,
                 random_state = None):

        self._decomposition  = 'Sparse PCA'
        self._num_components = num_components
        self._catalog_name   = catalog_name
        self._alpha          = alpha
        self._ridge_alpha    = ridge_alpha
        self._n_jobs         = n_jobs
        self._max_iter       = max_iter
        self._tol            = tol
        self._random_state   = random_state

        self._SPCA = SparsePCA(n_components=self._num_components,
                              alpha        = self._alpha,
                              ridge_alpha  = self._ridge_alpha,
                              n_jobs       = self._n_jobs,
                              max_iter     = self._max_iter,
                              tol          = self._tol,
                              random_state = self._random_state)

    def fit(self,waveforms):
        # TODO make sure there are more columns than rows (transpose if not)
        # normalize waveforms
        self._waveforms = waveforms
        self._SPCA.fit(self._waveforms)

    def fit_transform(self,waveforms):
        # TODO make sure there are more columns than rows (transpose if not)
        # normalize waveforms
        self._waveforms = waveforms
        self._A = self._SPCA.fit_transform(self._waveforms)
        return self._A

    def inverse_transform(self,A):
        # convert basis back to waveforms using fit
        new_waveforms = self._SPCA.inverse_transform(A)
        return new_waveforms

    def get_params(self):
        # TODO know what catalog was used! (include waveform metadata)
        params = self._SPCA.get_params()
        params['num_components'] = params.pop('n_components')
        params['Decomposition'] = self._decomposition
        return params

    def get_basis(self):
        """ Return the SPCA basis vectors (Z^\dagger)"""
        Zt = self._SPCA.components_
        return Zt
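A hypothetical usage sketch of this wrapper on synthetic waveforms (the shapes and parameter values are made up; the inverse_transform call additionally assumes a scikit-learn version in which SparsePCA.inverse_transform exists):

import numpy as np

rng = np.random.RandomState(0)
waveforms = rng.randn(200, 50)          # 200 waveforms, 50 samples each (placeholder shape)

spca = SPCA(num_components=5, catalog_name='toy', alpha=0.5)
A = spca.fit_transform(waveforms)       # (200, 5) projection onto the sparse components
Zt = spca.get_basis()                   # (5, 50) sparse basis vectors
recon = spca.inverse_transform(A)       # needs SparsePCA.inverse_transform (newer scikit-learn)

print(A.shape, Zt.shape, recon.shape)
print(spca.get_params()['num_components'])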
Example #57
0
res_poly = compare_KernelPCA(kernel='poly')
res_rbf = compare_KernelPCA(kernel='rbf')
res_sigmoid = compare_KernelPCA(kernel='sigmoid')
res_cosine = compare_KernelPCA(kernel='cosine')


kernel_pca_precomputed = KernelPCA(n_components=kernel_pca_n_comp, kernel='precomputed')
kernel_pca_precomputed_data = kernel_pca_precomputed.fit_transform(data.dot(data.T))
kernel_pca_precomputed.lambdas_.round(3)

# ---
# ## Modifications of principal component analysis
# ### SparsePCA

sparse_pca_lars = SparsePCA(2, method='lars')
sparse_pca_lars_data = sparse_pca_lars.fit_transform(data)

print("Sparse PCA with lars method components")
print(sparse_pca_lars.components_)


sparse_pca_cd = SparsePCA(2, method='cd')
sparse_pca_cd_data = sparse_pca_cd.fit_transform(data)

print("Sparse PCA with cd method components")
print(sparse_pca_cd.components_)


fig, axs = plt.subplots(1,2)
fig.set_figwidth(11)
def main():
    accounts = csv_to_dict('accounts.csv', 0, cast_evals=[str, read_time, readOutcome], type="account")
    account_nodes = csv_to_dict('nodevisits.csv', 1, cast_evals=[str, str, read_time, str], type="node")
    account_submissions = csv_to_dict('submissions.csv', 1, cast_evals=[str, str, read_time, str, str], type="submission")

    account_visits = account_nodes
    for acc in account_visits:
        account_visits[acc].extend(account_submissions[acc])
        account_visits[acc] = sorted(account_visits[acc], key=lambda k: k['time'])
    session_length(account_visits)

    #Build sessions based on time scale determined from previous code as 15 minutes
    sessions = []
    for acc in account_visits:
        actions = []
        for idx, visit in enumerate(account_visits[acc]):
            if idx == 0:
                actions = {"node": [], "submission": [], "learning_outcome": accounts[acc][0]["learning_outcome"]}
                actions[account_visits[acc][idx]["type"]].append(visit)
            else:
                #Time between visits in minutes
                delta_time = delta_minutes(visit["time"], account_visits[acc][idx-1]["time"])
                #New session, defined as 15 minutes from above
                if delta_time > 15:
                    sessions.append(actions)
                    actions = {"node": [], "submission": [], "learning_outcome": accounts[acc][0]["learning_outcome"]}
                    actions[account_visits[acc][idx]["type"]].append(visit)

                else:
                    actions[account_visits[acc][idx]["type"]].append(visit)
        sessions.append(actions)

    for session in sessions:
        if len(session["node"]) > 0 and len(session["submission"]) > 0:
            session["start_time"] = min(session["node"][0]["time"], session["submission"][0]["time"])
            session["end_time"] = max(session["node"] [len(session["node"]) -1]["time"], session["submission"] [len(session["submission"]) -1]["time"])
        elif len(session["node"]) > 0:
            session["start_time"] = session["node"][0]["time"]
            session["end_time"] = session["node"] [len(session["submission"]) -1]["time"]
        else:
            session["start_time"] = session["submission"][0]["time"]
            session["end_time"] = session["submission"] [len(session["submission"]) -1]["time"]

    #Remove sessions without any time difference or no nodes visited
    sessions = [session for session in sessions if delta_minutes(session["end_time"], session["start_time"]) != 0]

    X = session_properties(sessions)
    X = standardize(X)
    pca = SparsePCA(n_components = 2)
    #Negative one just makes plot easier to look at, PCA is sign insensitive so no real effect
    X_r = -1 * pca.fit(X).transform(X)

    kmeans = cluster.KMeans(n_clusters=4)
    group = kmeans.fit_predict(X_r)

    fig = plt.figure(figsize=(6,6))
    ax = fig.add_subplot(111)
    plt.rc('font', family='serif', size=20)
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.scatter(X_r[:,0], X_r[:,1],s=20,marker = 'o', c=group)
    plt.show()

    outcomes = np.asarray([session["learning_outcome"] for session in sessions])
    session_by_outcome = []
    tags = []
    labels = get_labels(X_r, group, 4)
    for result in range(0, 4):
        session_by_outcome.append(group[outcomes == result])
        if result == 0:
            tags.append("No certificate achieved")
        else:
            tags.append("Mastery Level = " + str(result))

    plot_hist(session_by_outcome, x_min = 0, x_max = 4, y_min = 0, y_max = 1, bins = 4, tags = tags, y_label = "Fraction of sessions", labels=labels)
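For comparison, a minimal, self-contained version of the standardize → SparsePCA(2) → KMeans(4) → scatter pipeline used in main(), run on synthetic session features (the array shape, random seeds, and the StandardScaler stand-in for standardize() are all assumptions):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import SparsePCA
from sklearn.cluster import KMeans

# Synthetic stand-in for the session feature matrix (n_sessions x n_features).
rng = np.random.RandomState(0)
X = rng.randn(300, 6)

X_std = StandardScaler().fit_transform(X)
X_r = SparsePCA(n_components=2, random_state=0).fit_transform(X_std)
group = KMeans(n_clusters=4, random_state=0).fit_predict(X_r)

plt.scatter(X_r[:, 0], X_r[:, 1], s=20, marker='o', c=group)
plt.show()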