def test_mini_batch_fit_transform():
    """Check MiniBatchSparsePCA fit/transform consistency on toy data.

    Three properties are asserted:

    * the components learned with the default settings are not all zero;
    * fitting with ``n_jobs=2`` yields (almost) the same transformed data
      as the default fit;
    * fitting with ``method='cd'`` yields (almost) the same components as
      the default fit.
    """
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array

    lars_estimator = MiniBatchSparsePCA(n_components=3, random_state=0,
                                        alpha=alpha)
    spca_lars = lars_estimator.fit(Y)
    U1 = spca_lars.transform(Y)

    def fit_transform_two_jobs():
        # Same hyper-parameters as above, but requesting two jobs.
        spca = MiniBatchSparsePCA(n_components=3, n_jobs=2, alpha=alpha,
                                  random_state=0)
        return spca.fit(Y).transform(Y)

    # Test multiple CPUs
    if sys.platform != 'win32':
        # we can efficiently use parallelism
        U2 = fit_transform_two_jobs()
    else:
        # fake parallelism for win32: joblib's multiprocessing handle is
        # set to None for the duration of the fit and always restored.
        import joblib
        saved_mp = joblib.parallel.multiprocessing
        joblib.parallel.multiprocessing = None
        try:
            U2 = fit_transform_two_jobs()
        finally:
            joblib.parallel.multiprocessing = saved_mp

    assert not np.all(spca_lars.components_ == 0)
    assert_array_almost_equal(U1, U2)

    # Test that CD gives similar results
    cd_estimator = MiniBatchSparsePCA(n_components=3, method='cd',
                                      alpha=alpha, random_state=0)
    spca_lasso = cd_estimator.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
def test_mini_batch_fit_transform():
    """Skipped test of MiniBatchSparsePCA fit/transform consistency.

    The test is skipped unconditionally by raising ``SkipTest`` on entry;
    everything after that statement is currently unreachable and is kept
    only so the test can be re-enabled by removing the ``raise``.
    """
    raise SkipTest

    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array

    lars_estimator = MiniBatchSparsePCA(n_components=3, random_state=0,
                                        alpha=alpha)
    spca_lars = lars_estimator.fit(Y)
    U1 = spca_lars.transform(Y)

    def fit_transform_two_jobs():
        # Same hyper-parameters as above, but requesting two jobs.
        estimator = MiniBatchSparsePCA(n_components=3, n_jobs=2, alpha=alpha,
                                       random_state=0)
        return estimator.fit(Y).transform(Y)

    # Test multiple CPUs
    if sys.platform != 'win32':
        # we can efficiently use parallelism
        U2 = fit_transform_two_jobs()
    else:
        # fake parallelism for win32: the bundled joblib's multiprocessing
        # handle is set to None during the fit and always restored.
        import sklearn.externals.joblib.parallel as joblib_par
        saved_mp = joblib_par.multiprocessing
        joblib_par.multiprocessing = None
        try:
            U2 = fit_transform_two_jobs()
        finally:
            joblib_par.multiprocessing = saved_mp

    assert_true(not np.all(spca_lars.components_ == 0))
    assert_array_almost_equal(U1, U2)

    # Test that CD gives similar results
    cd_estimator = MiniBatchSparsePCA(n_components=3, method='cd',
                                      alpha=alpha, random_state=0)
    spca_lasso = cd_estimator.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
class MiniBatchSparsePCAImpl:
    """Thin adapter that forwards ``fit``/``transform`` to a wrapped ``Op``.

    The keyword arguments given to the constructor are stored verbatim in
    ``_hyperparams`` and used to build the wrapped model.
    """

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model and return this adapter.

        ``y`` is forwarded to the wrapped model only when it is not None.
        """
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def transform(self, X):
        """Return the wrapped model's transform of ``X``."""
        transformed = self._wrapped_model.transform(X)
        return transformed
def batch_minibatch_sparse_pca(scaled_split_dfs, n_components, batch=50):
    '''
    Run mini-batch sparse PCA on every subset of a dictionary of splits.

    Each value of scaled_split_dfs is expected to provide 'x_train' and
    'x_test' entries. For every key the estimator is fitted on 'x_train'
    and then used to transform both 'x_train' and 'x_test'.

    n_components sets the number of resulting components; for best
    results, n_components should be smaller than the number of samples.
    batch is forwarded to the estimator as batch_size.

    The input dictionary is not modified: results are written into a deep
    copy of it.

    Returns two dictionaries, one with the sparse PCA features and one
    with information about the sparse PCA done (under 'ncomponents', the
    number of output columns obtained for each key).
    '''
    sparse_pca_dfs = copy.deepcopy(scaled_split_dfs)
    reducer = MiniBatchSparsePCA(n_components=n_components,
                                 batch_size=batch,
                                 random_state=0)
    ncomponents_by_key = {}

    for key, split in sparse_pca_dfs.items():
        # The same estimator object is refitted for each subset.
        reducer.fit(split['x_train'])
        reduced_train = reducer.transform(split['x_train'])
        # Test data is read from the untouched input dictionary.
        reduced_test = reducer.transform(scaled_split_dfs[key]['x_test'])

        split['x_train'] = reduced_train
        split['x_test'] = reduced_test
        ncomponents_by_key[key] = reduced_train.shape[1]

    sparse_pca_stats = {}
    sparse_pca_stats['ncomponents'] = ncomponents_by_key
    return sparse_pca_dfs, sparse_pca_stats
class MBSPCA:
    """``MiniBatchSparsePCA`` wrapper that ignores rows with NaN/inf values.

    Rows containing NaN or +/-inf are excluded before the data reaches the
    underlying model. ``predict`` returns one output row per input row,
    with NaN in the rows that were excluded.

    Parameters
    ----------
    rfe_cv
        When truthy, ``fit`` and ``predict`` raise ``Exception`` because
        this model cannot be combined with RFE_CV.
    *args, **kwargs
        Forwarded unchanged to ``MiniBatchSparsePCA``.
    """

    def __init__(self, rfe_cv, *args, **kwargs):
        self.rfe = None
        self.rfe_cv = rfe_cv
        self.model = MiniBatchSparsePCA(*args, **kwargs)

    @staticmethod
    def _valid_row_mask(data):
        """Return a boolean mask selecting rows of *data* free of NaN/inf."""
        # Work on a float32 copy so the caller's array is never modified.
        as_float = numpy.array(data, dtype=numpy.float32)
        as_float[as_float == numpy.inf] = numpy.nan
        as_float[as_float == -numpy.inf] = numpy.nan
        return ~pandas.isna(as_float).any(axis=1)

    def fit(self, X, y):
        """Fit the underlying model on the rows of ``X`` that are valid.

        A row is dropped when either its features or its target value in
        ``y`` is NaN or +/-inf. ``y`` is used only for this filtering.

        Raises
        ------
        Exception
            If ``rfe_cv`` is truthy.
        """
        stacked = numpy.concatenate([X, y.reshape(-1, 1)], axis=1)
        keep = self._valid_row_mask(stacked)
        X_ = X[keep, :]

        # Compare kept rows against input rows; the previous check compared
        # two arrays that always had the same length, so it never fired.
        n_dropped = X.shape[0] - X_.shape[0]
        if n_dropped:
            print(
                'FIT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                .format(n_dropped))

        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")
        self.model.fit(X_)

    def predict(self, X):
        """Transform ``X`` with the fitted model, keeping row alignment.

        Returns
        -------
        numpy.ndarray
            float64 array with ``X.shape[0]`` rows and as many columns as
            the model's transform output. Rows of ``X`` containing NaN or
            +/-inf are filled with NaN.

        Raises
        ------
        Exception
            If ``rfe_cv`` is truthy.
        """
        nan_mask = self._valid_row_mask(X)
        X_ = X[nan_mask, :]

        n_dropped = X.shape[0] - X_.shape[0]
        if n_dropped:
            print(
                'PREDICT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                .format(n_dropped))

        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")

        predicted = self.model.transform(X_)
        # Scatter the transformed rows back to their original positions.
        Z = numpy.full(shape=(X.shape[0], predicted.shape[1]),
                       fill_value=numpy.nan,
                       dtype=numpy.float64)
        Z[nan_mask, :] = predicted
        return Z
desc="load example users..."): if ret is None: continue idx, vec = ret for term_idx, weight in vec.items(): mtx[idx, term_idx] = weight print(f"[{FILE}] start to train TruncatedSVD...") transformer = MiniBatchSparsePCA(n_components=500, batch_size=100, random_state=0) transformer.fit(mtx.todense()) elapsed_time = time.time() - start_time print(f"[{FILE}] elapsed_time = {elapsed_time}") print(f"[{FILE}] start to transform matrix...") X_transformed = transformer.transform(mtx[:5000]) print(X_transformed) print(X_transformed.shape) print(type(X_transformed)) joblib.dump(transformer, f"{TOP_DIR}/var/transformer.joblib") if "--transform" in sys.argv: transformer = joblib.load(f"{TOP_DIR}/var/transformer.joblib") """ 1000個づつ分割 """ filenames = glob.glob(f"{HOME}/var/user_vectors/*") args = [] STEP = 2000 for i in range(0, len(filenames), STEP): args.append((i, filenames[i:i + STEP])) Path(f"{TOP_DIR}/tmp/data_svd").mkdir(exist_ok=True, parents=True)