def test_nmf_minibatchnmf_equivalence(beta_loss):
    """MiniBatchNMF must match NMF when one batch covers all samples.

    With ``batch_size == n_samples`` and ``forget_factor=0.0`` the online
    updates reduce to the full-batch multiplicative updates (stopping
    criterion aside), so both estimators should produce the same W.
    """
    rng = np.random.mtrand.RandomState(42)
    X = np.abs(rng.randn(48, 5))

    full_batch = NMF(
        n_components=5,
        beta_loss=beta_loss,
        solver="mu",
        random_state=0,
        tol=0,
    )
    online = MiniBatchNMF(
        n_components=5,
        beta_loss=beta_loss,
        random_state=0,
        tol=0,
        max_no_improvement=None,
        batch_size=X.shape[0],
        forget_factor=0.0,
    )

    W_full = full_batch.fit_transform(X)
    W_online = online.fit_transform(X)
    assert_allclose(W_full, W_online)
def test_minibatch_nmf_partial_fit():
    """Check fit / partial_fit equivalence (only valid without fresh restarts)."""
    rng = np.random.mtrand.RandomState(42)
    X = np.abs(rng.randn(100, 5))

    n_components = 5
    batch_size = 10
    max_iter = 2

    mbnmf1 = MiniBatchNMF(
        n_components=n_components,
        init="custom",
        random_state=0,
        max_iter=max_iter,
        batch_size=batch_size,
        tol=0,
        max_no_improvement=None,
        fresh_restarts=False,
    )
    mbnmf2 = MiniBatchNMF(n_components=n_components, init="custom", random_state=0)

    # Force the same init of H (W is recomputed anyway) to be able to
    # compare results.
    W, H = nmf._initialize_nmf(
        X, n_components=n_components, init="random", random_state=0
    )

    mbnmf1.fit(X, W=W, H=H)
    for _ in range(max_iter):
        # BUG FIX: the slices were `X[j : j + batch_size]` for
        # `j in range(batch_size)`, i.e. overlapping sliding windows, which
        # does not reproduce the disjoint mini-batches that `fit` cycles
        # over and so does not test fit/partial_fit equivalence.
        for j in range(X.shape[0] // batch_size):
            mbnmf2.partial_fit(
                X[j * batch_size : (j + 1) * batch_size], W=W[:batch_size], H=H
            )

    assert mbnmf1.n_steps_ == mbnmf2.n_steps_
    assert_allclose(mbnmf1.components_, mbnmf2.components_)
def test_minibatch_nmf_transform():
    """``fit_transform`` and ``fit(...).transform`` agree for MiniBatchNMF.

    The equivalence is only guaranteed with fresh restarts.
    """
    rng = np.random.mtrand.RandomState(42)
    A = np.abs(rng.randn(6, 5))
    estimator = MiniBatchNMF(
        n_components=3, random_state=0, tol=1e-3, fresh_restarts=True
    )
    from_fit = estimator.fit_transform(A)
    from_transform = estimator.transform(A)
    assert_allclose(from_fit, from_transform)
# NOTE(review): this definition is shadowed by a later function with the same
# name (`test_parameter_checking`) in this file, so pytest never collects this
# version. It appears to be the legacy variant still exercising the deprecated
# `regularization` parameter (scheduled for removal in 1.2 per the TODO) —
# either delete this copy or merge the two definitions.
def test_parameter_checking():
    A = np.ones((2, 2))
    name = "spam"
    with ignore_warnings(category=FutureWarning):
        # TODO remove in 1.2
        msg = "Invalid regularization parameter: got 'spam' instead of one of"
        with pytest.raises(ValueError, match=msg):
            NMF(regularization=name).fit(A)
    msg = "Invalid beta_loss parameter: solver 'cd' does not handle beta_loss = 1.0"
    with pytest.raises(ValueError, match=msg):
        NMF(solver="cd", beta_loss=1.0).fit(A)
    msg = "Negative values in data passed to"
    with pytest.raises(ValueError, match=msg):
        NMF().fit(-A)
    clf = NMF(2, tol=0.1).fit(A)
    with pytest.raises(ValueError, match=msg):
        clf.transform(-A)
    with pytest.raises(ValueError, match=msg):
        nmf._initialize_nmf(-A, 2, "nndsvd")

    for init in ["nndsvd", "nndsvda", "nndsvdar"]:
        msg = re.escape(
            "init = '{}' can only be used when "
            "n_components <= min(n_samples, n_features)".format(init)
        )
        with pytest.raises(ValueError, match=msg):
            NMF(3, init=init).fit(A)
        with pytest.raises(ValueError, match=msg):
            MiniBatchNMF(3, init=init).fit(A)
        with pytest.raises(ValueError, match=msg):
            nmf._initialize_nmf(A, 3, init)
def test_parameter_checking():
    # Here we only check for invalid parameter values that are not already
    # automatically tested in the common tests.
    A = np.ones((2, 2))

    beta_msg = "Invalid beta_loss parameter: solver 'cd' does not handle beta_loss = 1.0"
    with pytest.raises(ValueError, match=beta_msg):
        NMF(solver="cd", beta_loss=1.0).fit(A)

    negative_msg = "Negative values in data passed to"
    with pytest.raises(ValueError, match=negative_msg):
        NMF().fit(-A)
    clf = NMF(2, tol=0.1).fit(A)
    with pytest.raises(ValueError, match=negative_msg):
        clf.transform(-A)
    with pytest.raises(ValueError, match=negative_msg):
        nmf._initialize_nmf(-A, 2, "nndsvd")

    # The nndsvd-family inits require n_components <= min(n_samples, n_features).
    for init in ["nndsvd", "nndsvda", "nndsvdar"]:
        init_msg = re.escape(
            "init = '{}' can only be used when "
            "n_components <= min(n_samples, n_features)".format(init)
        )
        with pytest.raises(ValueError, match=init_msg):
            NMF(3, init=init).fit(A)
        with pytest.raises(ValueError, match=init_msg):
            MiniBatchNMF(3, init=init).fit(A)
        with pytest.raises(ValueError, match=init_msg):
            nmf._initialize_nmf(A, 3, init)
def test_nmf_true_reconstruction():
    # X is built from a sparse exact factorization W_true @ H_true (one
    # non-zero per column of W_true / row pattern of H_true), so an exact
    # solution exists by construction; both solvers should land close to it.
    n_samples = 15
    n_features = 10
    n_components = 5
    beta_loss = 1
    batch_size = 3
    max_iter = 1000

    rng = np.random.mtrand.RandomState(42)
    W_true = np.zeros([n_samples, n_components])
    W_array = np.abs(rng.randn(n_samples))
    for j in range(n_components):
        W_true[j % n_samples, j] = W_array[j % n_samples]
    H_true = np.zeros([n_components, n_features])
    H_array = np.abs(rng.randn(n_components))
    for j in range(n_features):
        H_true[j % n_components, j] = H_array[j % n_components]
    X = W_true @ H_true

    full_model = NMF(
        n_components=n_components,
        solver="mu",
        beta_loss=beta_loss,
        max_iter=max_iter,
        random_state=0,
    )
    reconstruction = full_model.fit_transform(X) @ full_model.components_
    assert full_model.reconstruction_err_ < 0.1
    assert_allclose(X, reconstruction)

    minibatch_model = MiniBatchNMF(
        n_components=n_components,
        beta_loss=beta_loss,
        batch_size=batch_size,
        random_state=0,
        max_iter=max_iter,
    )
    reconstruction = minibatch_model.fit_transform(X) @ minibatch_model.components_
    assert minibatch_model.reconstruction_err_ < 0.1
    # The online solver is noisier: allow a loose absolute tolerance.
    assert_allclose(X, reconstruction, atol=1)
def test_minibatch_nmf_verbose():
    # Exercise the verbose code path of MiniBatchNMF for coverage, swallowing
    # the progress output so it does not pollute the test log.
    A = np.random.RandomState(0).random_sample((100, 10))
    estimator = MiniBatchNMF(tol=1e-2, random_state=0, verbose=1)
    saved_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        estimator.fit(A)
    finally:
        # Always restore stdout, even if fit raises.
        sys.stdout = saved_stdout
def test_minibatch_nmf_negative_beta_loss(beta_loss):
    """Check that an error is raised if beta_loss < 0 and X contains zeros."""
    rng = np.random.RandomState(0)
    X = rng.normal(size=(6, 5))
    # Clip negatives to zero so X is non-negative but contains exact zeros.
    X[X < 0] = 0

    estimator = MiniBatchNMF(beta_loss=beta_loss, random_state=0)
    msg = "When beta_loss <= 0 and X contains zeros, the solver may diverge."
    with pytest.raises(ValueError, match=msg):
        estimator.fit(X)
def test_mbnmf_inverse_transform():
    """transform followed by inverse_transform approximately recovers the input."""
    rng = np.random.RandomState(0)
    A = np.abs(rng.randn(6, 4))
    estimator = MiniBatchNMF(
        random_state=rng,
        max_iter=500,
        init="nndsvdar",
        fresh_restarts=True,
    )
    embedding = estimator.fit_transform(A)
    recovered = estimator.inverse_transform(embedding)
    assert_allclose(A, recovered, rtol=1e-3, atol=1e-2)
def test_minibatch_nmf_wrong_params(param, match):
    """Invalid MiniBatchNMF-specific parameter values raise an informative error."""
    data = np.ones((2, 2))
    with pytest.raises(ValueError, match=match):
        MiniBatchNMF(**param).fit(data)
# accurate version of non-negative matrix factorization (:class:`decomposition.NMF`).
# :class:`MiniBatchNMF` divides the data into mini-batches and optimizes the NMF model
# in an online manner by cycling over the mini-batches, making it better suited for
# large datasets. In particular, it implements `partial_fit`, which can be used for
# online learning when the data is not readily available from the start, or when the
# data does not fit into memory.
import numpy as np

from sklearn.decomposition import MiniBatchNMF

# Build a small non-negative matrix from a known exact factorization.
rng = np.random.RandomState(0)
n_samples, n_features, n_components = 10, 10, 5
true_W = rng.uniform(size=(n_samples, n_components))
true_H = rng.uniform(size=(n_components, n_features))
X = true_W @ true_H

nmf = MiniBatchNMF(n_components=n_components, random_state=0)

# Simulate online learning by repeatedly feeding the data via partial_fit.
for _ in range(10):
    nmf.partial_fit(X)

W = nmf.transform(X)
H = nmf.components_
X_reconstructed = W @ H

# FIX: the first print argument was an f-string without placeholders
# (`f"relative reconstruction error: "`), and passing two arguments to
# print() inserted an extra separator space; use a single f-string.
print(
    "relative reconstruction error: "
    f"{np.sum((X - X_reconstructed) ** 2) / np.sum(X**2):.5f}"
)

# %%
# BisectingKMeans: divide and cluster