def test_mini_batch_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = MiniBatchSparsePCA(n_components=3, random_state=0,
                                   alpha=alpha).fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    if sys.platform == 'win32':  # fake parallelism for win32
        import joblib
        _mp = joblib.parallel.multiprocessing
        joblib.parallel.multiprocessing = None
        try:
            spca = MiniBatchSparsePCA(n_components=3, n_jobs=2, alpha=alpha,
                                      random_state=0)
            U2 = spca.fit(Y).transform(Y)
        finally:
            joblib.parallel.multiprocessing = _mp
    else:  # we can efficiently use parallelism
        spca = MiniBatchSparsePCA(n_components=3, n_jobs=2, alpha=alpha,
                                  random_state=0)
        U2 = spca.fit(Y).transform(Y)
    assert not np.all(spca_lars.components_ == 0)
    assert_array_almost_equal(U1, U2)
    # Test that CD gives similar results
    spca_lasso = MiniBatchSparsePCA(n_components=3, method='cd', alpha=alpha,
                                    random_state=0).fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
@pytest.mark.parametrize("norm_comp", [False, True])
def test_mini_batch_fit_transform(norm_comp):
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = MiniBatchSparsePCA(n_components=3, random_state=0, alpha=alpha,
                                   normalize_components=norm_comp).fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    if sys.platform == 'win32':  # fake parallelism for win32
        import sklearn.utils._joblib.parallel as joblib_par
        _mp = joblib_par.multiprocessing
        joblib_par.multiprocessing = None
        try:
            spca = MiniBatchSparsePCA(n_components=3, n_jobs=2, alpha=alpha,
                                      random_state=0,
                                      normalize_components=norm_comp)
            U2 = spca.fit(Y).transform(Y)
        finally:
            joblib_par.multiprocessing = _mp
    else:  # we can efficiently use parallelism
        spca = MiniBatchSparsePCA(n_components=3, n_jobs=2, alpha=alpha,
                                  random_state=0,
                                  normalize_components=norm_comp)
        U2 = spca.fit(Y).transform(Y)
    assert_true(not np.all(spca_lars.components_ == 0))
    assert_array_almost_equal(U1, U2)
    # Test that CD gives similar results
    spca_lasso = MiniBatchSparsePCA(n_components=3, method='cd', alpha=alpha,
                                    random_state=0,
                                    normalize_components=norm_comp).fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
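# A standalone sketch of the fit/transform behaviour the two tests above exercise,
# using random data in place of the test-suite helper generate_toy_data (an
# assumption; that helper lives in scikit-learn's test module, not here).
import numpy as np
from sklearn.decomposition import MiniBatchSparsePCA

rng = np.random.RandomState(0)
Y = rng.randn(10, 64)                        # wide array: fewer samples than features
spca = MiniBatchSparsePCA(n_components=3, alpha=1, random_state=0)
U = spca.fit(Y).transform(Y)                 # equivalent to fit_transform(Y)
print(U.shape)                               # (10, 3)
print(np.mean(spca.components_ == 0))        # fraction of loadings shrunk exactly to zero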
class MiniBatchSparsePCAImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
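# A minimal usage sketch for the wrapper above. It assumes `Op` is an alias for
# sklearn.decomposition.MiniBatchSparsePCA defined elsewhere in the original
# module, so the alias is bound explicitly here for illustration.
import numpy as np
from sklearn.decomposition import MiniBatchSparsePCA as Op

X = np.random.RandomState(0).randn(50, 20)
impl = MiniBatchSparsePCAImpl(n_components=5, alpha=1, random_state=0)
X_reduced = impl.fit(X).transform(X)   # fit tolerates an optional y; transform projects onto the sparse components
print(X_reduced.shape)                 # (50, 5)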
class MBSPCA:

    def __init__(self, rfe_cv, *args, **kwargs):
        self.rfe = None
        self.rfe_cv = rfe_cv
        self.model = MiniBatchSparsePCA(*args, **kwargs)

    def fit(self, X, y):
        Z = numpy.concatenate([X, y.reshape(-1, 1)], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        # keep only the rows with no NaNs/infs in either X or y
        X_, y_ = (X[~pandas.isna(Z).any(axis=1), :],
                  y[~pandas.isna(Z).any(axis=1)])
        if X_.shape[0] != X.shape[0]:
            print('FIT: the sample contains NaNs, they were dropped\t'
                  'N of dropped NaNs: {0}'.format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")
        else:
            self.model.fit(X_)

    def predict(self, X):
        Z = numpy.concatenate([X], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        nan_mask = ~pandas.isna(Z).any(axis=1)
        X_ = X[nan_mask, :]
        if X_.shape[0] != X.shape[0]:
            print('PREDICT: the sample contains NaNs, they were dropped\t'
                  'N of dropped NaNs: {0}'.format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")
        else:
            predicted = self.model.transform(X_)
            # re-insert NaN rows so the output keeps the original sample order
            Z = numpy.full(shape=(X.shape[0], predicted.shape[1]),
                           fill_value=numpy.nan, dtype=numpy.float64)
            Z[nan_mask, :] = predicted
            return Z
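# A minimal sketch of using MBSPCA, assuming `numpy`, `pandas` and
# MiniBatchSparsePCA are the module-level imports the class relies on. The row
# with a NaN is dropped before fitting and comes back as an all-NaN row in the
# output of predict(), preserving sample order.
import numpy
import pandas
from sklearn.decomposition import MiniBatchSparsePCA

rng = numpy.random.RandomState(0)
X = rng.randn(40, 8)
y = rng.randn(40)
X[3, 2] = numpy.nan                               # one corrupted sample

mbspca = MBSPCA(rfe_cv=False, n_components=4, alpha=1, random_state=0)
mbspca.fit(X, y)                                  # row 3 is dropped before fitting
scores = mbspca.predict(X)                        # shape (40, 4); row 3 is all-NaN
print(scores.shape, numpy.isnan(scores[3]).all())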
def batch_minibatch_sparse_pca(scaled_split_dfs, n_components, batch=50):
    '''
    Performs mini-batch sparse PCA for each subset in a dictionary of x and y
    train and x and y test sets. The number of resulting components is set by
    n_components. For best results, n_components should be smaller than the
    number of samples. batch determines how many features are analyzed at a
    time. Returns two dictionaries, one with the sparse PCA features and one
    with information about the sparse PCA that was performed.
    '''
    sparse_pca_dfs = copy.deepcopy(scaled_split_dfs)
    sparse_mb_pca = MiniBatchSparsePCA(n_components=n_components,
                                       batch_size=batch, random_state=0)
    sparse_pca_ncomponents = {}
    sparse_pca_stats = {}
    for key in sparse_pca_dfs:
        sparse_mb_pca.fit(sparse_pca_dfs[key]['x_train'])
        sparse_pca_x_train = sparse_mb_pca.transform(
            sparse_pca_dfs[key]['x_train'])
        sparse_pca_dfs[key]['x_train'] = sparse_pca_x_train
        sparse_pca_x_test = sparse_mb_pca.transform(
            scaled_split_dfs[key]['x_test'])
        sparse_pca_dfs[key]['x_test'] = sparse_pca_x_test
        sparse_pca_ncomponents[key] = sparse_pca_x_train.shape[1]
    sparse_pca_stats['ncomponents'] = sparse_pca_ncomponents
    return sparse_pca_dfs, sparse_pca_stats
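# A minimal calling sketch for batch_minibatch_sparse_pca. The nested dict layout
# ({split_name: {'x_train', 'y_train', 'x_test', 'y_test'}}) is inferred from the
# docstring and function body; `copy` and MiniBatchSparsePCA are assumed to be
# imported at module level in the original file.
import copy
import numpy as np
from sklearn.decomposition import MiniBatchSparsePCA

rng = np.random.RandomState(0)
scaled_split_dfs = {
    'split_0': {'x_train': rng.randn(100, 30), 'y_train': rng.randn(100),
                'x_test': rng.randn(25, 30), 'y_test': rng.randn(25)},
}
spca_dfs, spca_stats = batch_minibatch_sparse_pca(scaled_split_dfs,
                                                  n_components=5, batch=10)
print(spca_dfs['split_0']['x_train'].shape)   # (100, 5)
print(spca_stats['ncomponents'])              # {'split_0': 5}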
        args.append((idx, filename))
    with ProcessPoolExecutor(max_workers=psutil.cpu_count()) as exe:
        for ret in tqdm(exe.map(load, args), total=len(args),
                        desc="load example users..."):
            if ret is None:
                continue
            idx, vec = ret
            for term_idx, weight in vec.items():
                mtx[idx, term_idx] = weight
    print(f"[{FILE}] start to train MiniBatchSparsePCA...")
    transformer = MiniBatchSparsePCA(n_components=500, batch_size=100,
                                     random_state=0)
    transformer.fit(mtx.todense())
    elapsed_time = time.time() - start_time
    print(f"[{FILE}] elapsed_time = {elapsed_time}")
    print(f"[{FILE}] start to transform matrix...")
    X_transformed = transformer.transform(mtx[:5000])
    print(X_transformed)
    print(X_transformed.shape)
    print(type(X_transformed))
    joblib.dump(transformer, f"{TOP_DIR}/var/transformer.joblib")

if "--transform" in sys.argv:
    transformer = joblib.load(f"{TOP_DIR}/var/transformer.joblib")
    """ Split into chunks of 1000 """
    filenames = glob.glob(f"{HOME}/var/user_vectors/*")
    args = []
    STEP = 2000
clf = IsolationForest(random_state=0, n_jobs=-1, contamination=0.25).fit(X)
A = clf.predict(X)
print((A == -1).mean(), (labels != 0).mean(), ((A == -1) == (labels != 0)).mean())

#%%
from sklearn.decomposition import MiniBatchSparsePCA

X = data_pts_1
mbsp = MiniBatchSparsePCA(n_components=20, alpha=1, ridge_alpha=0.01,
                          batch_size=4, n_jobs=-1)
mbsp.fit(X)
# X_transformed = transformer.transform(X)
# X_transformed.shape
# plt.plot(mbsp.components_[0, :]); plt.show()

#%%
X = data_pts_1
from sklearn import decomposition

mbdl = decomposition.MiniBatchDictionaryLearning(n_jobs=-1, n_components=20,
                                                 alpha=0.1, n_iter=200,
                                                 batch_size=5,
wavelims = (4000, 5700)

# do PCA
X, V, Z, Xs_hat, X_hat, wavelengths, ev, n = do_PCA(dir, wavelims, n_pcs)

# get residuals
X_residual = X - X_hat
means, stds = np.mean(X_residual, axis=0), np.std(X_residual, axis=0)
#Xs_residual = (X_residual - means)/stds
Xs_residual = X_residual - means

# sparse PCA
print("starting sparse PCA")
for a in alpha:
    spca = MiniBatchSparsePCA(n_components=n_spcs, alpha=a)
    spca.fit(Xs_residual)

    f, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, sharex=True)
    for i in range(len(V)):
        ax1.plot(wavelengths, V[i, :], ',', alpha=1 - 0.1 * i, label='pc %d' % i)
    for i in range(5):
        ax2.plot(wavelengths, spca.components_[i, :], ',', alpha=1 - 0.1 * i,
                 label='sparse pc %d' % i)
    for i in range(5):
        ax3.plot(wavelengths, spca.components_[i + 5, :], ',', alpha=1 - 0.1 * i,
                 label='sparse pc %d' % i)
    for i in range(5):
        ax4.plot(wavelengths, spca.components_[i + 10, :], ',', alpha=1 - 0.1 * i,
                 label='sparse pc %d' % i)
    ax1.legend(fontsize=10)
    ax2.legend(fontsize=10)
    ax4.set_xlabel('wavelength')
    ax1.set_ylabel('eigenvector values')
    ax2.set_ylabel('eigenvector values')
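    # A hedged addition (not in the original script): summarise, for the current
    # alpha, how sparse the fitted components are, to complement the plots above.
    sparsity = np.mean(spca.components_ == 0)
    print("alpha = %s, fraction of zero loadings = %.2f" % (a, sparsity))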