Example #1
def test_mini_batch_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = MiniBatchSparsePCA(n_components=3, random_state=0,
                                   alpha=alpha).fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    if sys.platform == 'win32':  # fake parallelism for win32
        import joblib
        _mp = joblib.parallel.multiprocessing
        joblib.parallel.multiprocessing = None
        try:
            spca = MiniBatchSparsePCA(n_components=3,
                                      n_jobs=2,
                                      alpha=alpha,
                                      random_state=0)
            U2 = spca.fit(Y).transform(Y)
        finally:
            joblib.parallel.multiprocessing = _mp
    else:  # we can efficiently use parallelism
        spca = MiniBatchSparsePCA(n_components=3,
                                  n_jobs=2,
                                  alpha=alpha,
                                  random_state=0)
        U2 = spca.fit(Y).transform(Y)
    assert not np.all(spca_lars.components_ == 0)
    assert_array_almost_equal(U1, U2)
    # Test that CD gives similar results
    spca_lasso = MiniBatchSparsePCA(n_components=3,
                                    method='cd',
                                    alpha=alpha,
                                    random_state=0).fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
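This test (and the variant in the next example) relies on generate_toy_data, a helper from scikit-learn's sparse PCA test module rather than the public API. A rough stand-in with the same call signature, assuming a simple low-rank-plus-noise construction (the real helper shapes its components as small image patches):

import numpy as np
from sklearn.utils import check_random_state


def generate_toy_data(n_components, n_samples, image_size, random_state=None):
    # Build Y = U @ V + noise, where each row of Y is a flattened image.
    n_features = image_size[0] * image_size[1]
    rng = check_random_state(random_state)
    U = rng.randn(n_samples, n_components)   # per-sample codes
    V = rng.randn(n_components, n_features)  # dictionary atoms
    Y = np.dot(U, V) + 0.1 * rng.randn(n_samples, n_features)
    return Y, U, V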
Example #2
def test_mini_batch_fit_transform(norm_comp):
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = MiniBatchSparsePCA(n_components=3, random_state=0,
                                   alpha=alpha,
                                   normalize_components=norm_comp).fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    if sys.platform == 'win32':  # fake parallelism for win32
        import sklearn.utils._joblib.parallel as joblib_par
        _mp = joblib_par.multiprocessing
        joblib_par.multiprocessing = None
        try:
            spca = MiniBatchSparsePCA(n_components=3, n_jobs=2, alpha=alpha,
                                      random_state=0,
                                      normalize_components=norm_comp)
            U2 = spca.fit(Y).transform(Y)
        finally:
            joblib_par.multiprocessing = _mp
    else:  # we can efficiently use parallelism
        spca = MiniBatchSparsePCA(n_components=3, n_jobs=2, alpha=alpha,
                                  random_state=0,
                                  normalize_components=norm_comp)
        U2 = spca.fit(Y).transform(Y)
    assert_true(not np.all(spca_lars.components_ == 0))
    assert_array_almost_equal(U1, U2)
    # Test that CD gives similar results
    spca_lasso = MiniBatchSparsePCA(n_components=3, method='cd', alpha=alpha,
                                    random_state=0,
                                    normalize_components=norm_comp).fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
Example #3
class MiniBatchSparsePCAImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
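A usage sketch for the wrapper above, assuming Op is bound to sklearn.decomposition.MiniBatchSparsePCA (the snippet itself does not show that binding):

import numpy as np
from sklearn.decomposition import MiniBatchSparsePCA as Op  # assumed binding

X = np.random.RandomState(0).randn(100, 30)
impl = MiniBatchSparsePCAImpl(n_components=5, alpha=1, random_state=0)
X_reduced = impl.fit(X).transform(X)
print(X_reduced.shape)  # (100, 5)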
Example #4
class MBSPCA:
    def __init__(self, rfe_cv, *args, **kwargs):
        self.rfe = None
        self.rfe_cv = rfe_cv
        self.model = MiniBatchSparsePCA(*args, **kwargs)

    def fit(self, X, y):
        Z = numpy.concatenate([X, y.reshape(-1, 1)], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        nan_mask = ~pandas.isna(Z).any(axis=1)
        X_, y_ = X[nan_mask, :], y[nan_mask]
        if X_.shape[0] != X.shape[0]:
            print(
                'FIT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                .format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")
        else:
            self.model.fit(X_)

    def predict(self, X):
        Z = numpy.concatenate([X], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        nan_mask = ~pandas.isna(Z).any(axis=1)
        X_ = X[nan_mask, :]
        if X_.shape[0] != X.shape[0]:
            print(
                'PREDICT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                .format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")
        else:
            predicted = self.model.transform(X_)
            Z = numpy.full(shape=(X.shape[0], predicted.shape[1]),
                           fill_value=numpy.nan,
                           dtype=numpy.float64)
            Z[nan_mask, :] = predicted
        return Z
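A short usage sketch for MBSPCA, assuming numpy, pandas and MiniBatchSparsePCA are available as module-level names (the snippet does not show its imports). Rows containing NaN are dropped for fitting and come back as all-NaN rows from predict:

import numpy
import pandas
from sklearn.decomposition import MiniBatchSparsePCA

X = numpy.random.RandomState(0).randn(50, 10)
y = numpy.zeros(50)
X[3, 0] = numpy.nan                       # this row is dropped internally
mbspca = MBSPCA(rfe_cv=False, n_components=4, random_state=0)
mbspca.fit(X, y)                          # note: fit does not return self
Z = mbspca.predict(X)
print(Z.shape, numpy.isnan(Z[3]).all())   # (50, 4) True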
Example #5
def batch_minibatch_sparse_pca(scaled_split_dfs, n_components, batch=50):
    ''' Performs mini-batch sparse PCA for each subset in a dictionary of x and
    y train, and x and y test. The number of resulting components is set by
    n_components. For best results, n_components should be smaller than
    the number of samples. Batch determines how many features are analyzed at a
    time. Returns two dictionaries, one with the sparse PCA
    features and one with information about the sparse PCA that was done. '''
    sparse_pca_dfs = copy.deepcopy(scaled_split_dfs)
    sparse_mb_pca = MiniBatchSparsePCA(n_components=n_components,
                                       batch_size=batch,
                                       random_state=0)
    sparse_pca_ncomponents = {}
    sparse_pca_stats = {}
    for key in sparse_pca_dfs:
        sparse_mb_pca.fit(sparse_pca_dfs[key]['x_train'])
        sparse_pca_x_train = sparse_mb_pca.transform(
            sparse_pca_dfs[key]['x_train'])
        sparse_pca_dfs[key]['x_train'] = sparse_pca_x_train
        sparse_pca_x_test = sparse_mb_pca.transform(
            scaled_split_dfs[key]['x_test'])
        sparse_pca_dfs[key]['x_test'] = sparse_pca_x_test
        sparse_pca_ncomponents[key] = sparse_pca_x_train.shape[1]
    sparse_pca_stats['ncomponents'] = sparse_pca_ncomponents
    return sparse_pca_dfs, sparse_pca_stats
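A minimal usage sketch for batch_minibatch_sparse_pca, assuming scaled_split_dfs maps split names to dicts holding 'x_train', 'y_train', 'x_test' and 'y_test' arrays (hypothetical data; copy and MiniBatchSparsePCA must be imported at module level):

import copy
import numpy as np
from sklearn.decomposition import MiniBatchSparsePCA

rng = np.random.RandomState(0)
scaled_split_dfs = {
    'subset_a': {'x_train': rng.randn(40, 12), 'y_train': rng.randn(40),
                 'x_test': rng.randn(10, 12), 'y_test': rng.randn(10)},
}
pca_dfs, pca_stats = batch_minibatch_sparse_pca(scaled_split_dfs,
                                                n_components=3, batch=10)
print(pca_dfs['subset_a']['x_train'].shape)   # (40, 3)
print(pca_stats['ncomponents'])               # {'subset_a': 3}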
Example #6
        args.append((idx, filename))
    with ProcessPoolExecutor(max_workers=psutil.cpu_count()) as exe:
        for ret in tqdm(exe.map(load, args),
                        total=len(args),
                        desc="load example users..."):
            if ret is None:
                continue
            idx, vec = ret
            for term_idx, weight in vec.items():
                mtx[idx, term_idx] = weight

    print(f"[{FILE}] start to train TruncatedSVD...")
    transformer = MiniBatchSparsePCA(n_components=500,
                                     batch_size=100,
                                     random_state=0)
    transformer.fit(mtx.todense())
    elapsed_time = time.time() - start_time
    print(f"[{FILE}] elapsed_time = {elapsed_time}")
    print(f"[{FILE}] start to transform matrix...")
    X_transformed = transformer.transform(mtx[:5000].todense())
    print(X_transformed)
    print(X_transformed.shape)
    print(type(X_transformed))
    joblib.dump(transformer, f"{TOP_DIR}/var/transformer.joblib")

if "--transform" in sys.argv:
    transformer = joblib.load(f"{TOP_DIR}/var/transformer.joblib")
    """ 1000個づつ分割 """
    filenames = glob.glob(f"{HOME}/var/user_vectors/*")
    args = []
    STEP = 2000
Example #7
clf = IsolationForest(random_state=0, n_jobs=-1, contamination=0.25).fit(X)

A = clf.predict(X)

print((A == -1).mean(), (labels != 0).mean(),
      ((A == -1) == (labels != 0)).mean())

#%%
from sklearn.decomposition import MiniBatchSparsePCA
X = data_pts_1
mbsp = MiniBatchSparsePCA(n_components=20,
                          alpha=1,
                          ridge_alpha=0.01,
                          batch_size=4,
                          n_jobs=-1)
mbsp.fit(X)
#X_transformed = transformer.transform(X)
# X_transformed.shape

# plt.plot(mbsp.components_[0,:]); plt.show()
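# Sanity check: with alpha=1 a large fraction of the learned loadings
# should be exactly zero.
print("zero loadings fraction:", (mbsp.components_ == 0).mean())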

#%%
X = data_pts_1

from sklearn import decomposition

mbdl = decomposition.MiniBatchDictionaryLearning(n_jobs=-1,
                                                 n_components=20,
                                                 alpha=0.1,
                                                 n_iter=200,
                                                 batch_size=5,
Example #8
    wavelims = (4000,5700)

    # do PCA
    X, V, Z, Xs_hat, X_hat, wavelengths, ev, n = do_PCA(dir, wavelims, n_pcs)

    # get residuals
    X_residual = X - X_hat
    means, stds = np.mean(X_residual, axis=0), np.std(X_residual, axis=0)
    #Xs_residual = (X_residual - means)/stds
    Xs_residual = X_residual - means

    # sparse PCA
    print "starting sparse PCA"
    for a in alpha:
        spca = MiniBatchSparsePCA(n_components=n_spcs, alpha=a)
        spca.fit(Xs_residual)

        f, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, sharex=True)
        for i in range(len(V)):
            ax1.plot(wavelengths, V[i,:], ',', alpha=1-0.1*i, label='pc %d'%i)
        for i in range(5):
            ax2.plot(wavelengths,spca.components_[i,:], ',', alpha=1-0.1*i, label='sparse pc %d'%i)
        for i in range(5):
            ax3.plot(wavelengths,spca.components_[i+5,:], ',', alpha=1-0.1*i, label='sparse pc %d'%i)
        for i in range(5):
            ax4.plot(wavelengths,spca.components_[i+10,:], ',', alpha=1-0.1*i, label='sparse pc %d'%i)
        ax1.legend(fontsize=10)
        ax2.legend(fontsize=10)
        ax4.set_xlabel('wavelength')
        ax1.set_ylabel('eigenvector values')
        ax2.set_ylabel('eigenvector values')