Example #1
def test_mini_batch_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = MiniBatchSparsePCA(n_components=3, random_state=0,
                                   alpha=alpha).fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    if sys.platform == 'win32':  # fake parallelism for win32
        import joblib
        _mp = joblib.parallel.multiprocessing
        joblib.parallel.multiprocessing = None
        try:
            spca = MiniBatchSparsePCA(n_components=3,
                                      n_jobs=2,
                                      alpha=alpha,
                                      random_state=0)
            U2 = spca.fit(Y).transform(Y)
        finally:
            joblib.parallel.multiprocessing = _mp
    else:  # we can efficiently use parallelism
        spca = MiniBatchSparsePCA(n_components=3,
                                  n_jobs=2,
                                  alpha=alpha,
                                  random_state=0)
        U2 = spca.fit(Y).transform(Y)
    assert not np.all(spca_lars.components_ == 0)
    assert_array_almost_equal(U1, U2)
    # Test that CD gives similar results
    spca_lasso = MiniBatchSparsePCA(n_components=3,
                                    method='cd',
                                    alpha=alpha,
                                    random_state=0).fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
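The test above depends on helpers from scikit-learn's own test module (generate_toy_data, assert_array_almost_equal). A minimal standalone sketch of the same single-worker vs. multi-worker check, using plain NumPy data in place of those helpers, could look like the following; the toy data shape and parameter values are assumptions, not the original fixture.

import numpy as np
from sklearn.decomposition import MiniBatchSparsePCA

# Small random "wide" matrix standing in for generate_toy_data's output.
rng = np.random.RandomState(0)
Y = rng.randn(10, 64)

# Fit once on a single worker and once with two workers; with a fixed
# random_state the resulting transforms should agree.
U1 = MiniBatchSparsePCA(n_components=3, alpha=1,
                        random_state=0).fit(Y).transform(Y)
U2 = MiniBatchSparsePCA(n_components=3, alpha=1, n_jobs=2,
                        random_state=0).fit(Y).transform(Y)
np.testing.assert_array_almost_equal(U1, U2)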
Example #2
def test_mini_batch_fit_transform():
    raise SkipTest
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = MiniBatchSparsePCA(n_components=3, random_state=0,
                                   alpha=alpha).fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    if sys.platform == 'win32':  # fake parallelism for win32
        import sklearn.externals.joblib.parallel as joblib_par
        _mp = joblib_par.multiprocessing
        joblib_par.multiprocessing = None
        try:
            U2 = MiniBatchSparsePCA(n_components=3, n_jobs=2, alpha=alpha,
                                    random_state=0).fit(Y).transform(Y)
        finally:
            joblib_par.multiprocessing = _mp
    else:  # we can efficiently use parallelism
        U2 = MiniBatchSparsePCA(n_components=3, n_jobs=2, alpha=alpha,
                                random_state=0).fit(Y).transform(Y)
    assert_true(not np.all(spca_lars.components_ == 0))
    assert_array_almost_equal(U1, U2)
    # Test that CD gives similar results
    spca_lasso = MiniBatchSparsePCA(n_components=3, method='cd', alpha=alpha,
                                    random_state=0).fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
Example #3
class MiniBatchSparsePCAImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
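In wrapper classes of this style, Op is expected to be bound to the underlying scikit-learn estimator before the wrapper is used; the binding and the toy data below are assumptions made only for illustration.

import numpy as np
from sklearn.decomposition import MiniBatchSparsePCA as Op  # assumed binding for Op

# Hyperparameters given to the wrapper are passed straight through to Op.
X = np.random.RandomState(0).randn(50, 20)
impl = MiniBatchSparsePCAImpl(n_components=5, random_state=0)
X_reduced = impl.fit(X).transform(X)
print(X_reduced.shape)  # (50, 5)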
Example #4
def batch_minibatch_sparse_pca(scaled_split_dfs, n_components, batch=50):
    ''' Performs mini-batch sparse PCA for each subset in a dictionary of
    x/y train and x/y test splits. The number of resulting components is set
    by n_components; for best results, n_components should be smaller than
    the number of samples. batch determines how many features are analyzed
    at a time. Returns two dictionaries, one with the sparse PCA features
    and one with information about the sparse PCA that was performed. '''
    sparse_pca_dfs = copy.deepcopy(scaled_split_dfs)
    sparse_mb_pca = MiniBatchSparsePCA(n_components=n_components,
                                       batch_size=batch,
                                       random_state=0)
    sparse_pca_ncomponents = {}
    sparse_pca_stats = {}
    for key in sparse_pca_dfs:
        sparse_mb_pca.fit(sparse_pca_dfs[key]['x_train'])
        sparse_pca_x_train = sparse_mb_pca.transform(
            sparse_pca_dfs[key]['x_train'])
        sparse_pca_dfs[key]['x_train'] = sparse_pca_x_train
        sparse_pca_x_test = sparse_mb_pca.transform(
            scaled_split_dfs[key]['x_test'])
        sparse_pca_dfs[key]['x_test'] = sparse_pca_x_test
        sparse_pca_ncomponents[key] = sparse_pca_x_train.shape[1]
    sparse_pca_stats['ncomponents'] = sparse_pca_ncomponents
    return sparse_pca_dfs, sparse_pca_stats
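The function expects a dictionary of subsets, each holding 'x_train' and 'x_test' arrays; only the x entries are transformed. A small usage sketch with made-up data follows, assuming the function above (and its copy / MiniBatchSparsePCA imports) is in scope; the subset names, shapes, and parameter values are assumptions.

import numpy as np

rng = np.random.RandomState(0)
scaled_split_dfs = {
    'subset_a': {'x_train': rng.randn(100, 30), 'x_test': rng.randn(20, 30)},
    'subset_b': {'x_train': rng.randn(80, 30), 'x_test': rng.randn(20, 30)},
}

pca_dfs, pca_stats = batch_minibatch_sparse_pca(scaled_split_dfs,
                                                n_components=5, batch=25)
print(pca_stats['ncomponents'])              # {'subset_a': 5, 'subset_b': 5}
print(pca_dfs['subset_a']['x_train'].shape)  # (100, 5)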
Example #5
class MBSPCA:
    def __init__(self, rfe_cv, *args, **kwargs):
        self.rfe = None
        self.rfe_cv = rfe_cv
        self.model = MiniBatchSparsePCA(*args, **kwargs)

    def fit(self, X, y):
        Z = numpy.concatenate([X, y.reshape(-1, 1)], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        X_, y_ = X[~pandas.isna(Z).any(axis=1), :], y[~pandas.isna(Z).any(
            axis=1)]
        if X_.shape[0] != X.shape[0]:  # rows were dropped because of NaNs
            print(
                'FIT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                .format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")
        else:
            self.model.fit(X_)

    def predict(self, X):
        Z = numpy.concatenate([X], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        nan_mask = ~pandas.isna(Z).any(axis=1)
        X_ = X[nan_mask, :]
        if X_.shape[0] != X.shape[0]:  # rows were dropped because of NaNs
            print(
                'PREDICT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                .format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")
        else:
            predicted = self.model.transform(X_)
            Z = numpy.full(shape=(X.shape[0], predicted.shape[1]),
                           fill_value=numpy.nan,
                           dtype=numpy.float64)
            Z[nan_mask, :] = predicted
        return Z
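MBSPCA drops rows that contain NaN or infinite values before fitting, and predict puts all-NaN rows back into the output so the row count matches the input. A small sketch under those semantics, assuming numpy, pandas, and MiniBatchSparsePCA are imported as the class above requires; the data and shapes are made up.

import numpy
import pandas  # used inside MBSPCA for the NaN mask
from sklearn.decomposition import MiniBatchSparsePCA

rng = numpy.random.RandomState(0)
X = rng.randn(60, 8)
y = rng.randn(60)
X[3, 2] = numpy.nan   # this row is dropped during fit and predict
X[7, 0] = numpy.inf   # infinities are treated as NaN as well

model = MBSPCA(rfe_cv=False, n_components=3, random_state=0)
model.fit(X, y)
Z = model.predict(X)
print(Z.shape)        # (60, 3); rows 3 and 7 come back as all-NaN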
                        desc="load example users..."):
            if ret is None:
                continue
            idx, vec = ret
            for term_idx, weight in vec.items():
                mtx[idx, term_idx] = weight

    print(f"[{FILE}] start to train TruncatedSVD...")
    transformer = MiniBatchSparsePCA(n_components=500,
                                     batch_size=100,
                                     random_state=0)
    transformer.fit(mtx.toarray())
    elapsed_time = time.time() - start_time
    print(f"[{FILE}] elapsed_time = {elapsed_time}")
    print(f"[{FILE}] start to transform matrix...")
    X_transformed = transformer.transform(mtx[:5000].toarray())
    print(X_transformed)
    print(X_transformed.shape)
    print(type(X_transformed))
    joblib.dump(transformer, f"{TOP_DIR}/var/transformer.joblib")

if "--transform" in sys.argv:
    transformer = joblib.load(f"{TOP_DIR}/var/transformer.joblib")
    """ 1000個づつ分割 """
    filenames = glob.glob(f"{HOME}/var/user_vectors/*")
    args = []
    STEP = 2000
    for i in range(0, len(filenames), STEP):
        args.append((i, filenames[i:i + STEP]))

    Path(f"{TOP_DIR}/tmp/data_svd").mkdir(exist_ok=True, parents=True)