def test_mini_batch():
    np.random.seed(0)
    T = 100
    # init
    N = 16
    M = 20
    Y = np.random.randn(M * T, N)
    K = 4
    tl = MiniBatchTransformLearning(transform_n_nonzero_coefs=K)
    for i in range(T):
        ys = Y[i * M:(i + 1) * M]
        tl.partial_fit(ys)
    sc = SparseCoder(tl.components_, transform_n_nonzero_coefs=K)
    code = sc.fit_transform(Y)
    tl_error = np.linalg.norm(Y - code.dot(tl.components_))
    print('Mini Batch Transform Learning:', tl_error)
    W = np.random.randn(N, N)
    sc = SparseCoder(W, transform_n_nonzero_coefs=K)
    code = sc.fit_transform(Y)
    random_error = np.linalg.norm(Y - code.dot(W))
    print('Random Dictionary:', random_error)
    assert tl_error < random_error
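# Note: MiniBatchTransformLearning is not a scikit-learn class. A minimal
# sketch of the same streaming evaluation, using sklearn's
# MiniBatchDictionaryLearning as a stand-in (an assumption, not the class
# under test):
import numpy as np
from sklearn.decomposition import MiniBatchDictionaryLearning, SparseCoder

np.random.seed(0)
Y = np.random.randn(2000, 16)
dl = MiniBatchDictionaryLearning(n_components=16, transform_n_nonzero_coefs=4)
for i in range(100):
    dl.partial_fit(Y[i * 20:(i + 1) * 20])  # stream mini-batches of 20 samples
code = SparseCoder(dl.components_, transform_n_nonzero_coefs=4).fit_transform(Y)
print('streamed dictionary error:', np.linalg.norm(Y - code.dot(dl.components_)))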
def test_max_iter():
    def ricker_function(resolution, center, width):
        """Discrete sub-sampled Ricker (Mexican hat) wavelet"""
        x = np.linspace(0, resolution - 1, resolution)
        x = (
            (2 / (np.sqrt(3 * width) * np.pi**0.25))
            * (1 - (x - center) ** 2 / width**2)
            * np.exp(-((x - center) ** 2) / (2 * width**2))
        )
        return x

    def ricker_matrix(width, resolution, n_components):
        """Dictionary of Ricker (Mexican hat) wavelets"""
        centers = np.linspace(0, resolution - 1, n_components)
        D = np.empty((n_components, resolution))
        for i, center in enumerate(centers):
            D[i] = ricker_function(resolution, center, width)
        D /= np.sqrt(np.sum(D**2, axis=1))[:, np.newaxis]
        return D

    transform_algorithm = "lasso_cd"
    resolution = 1024
    subsampling = 3  # subsampling factor
    n_components = resolution // subsampling

    # Compute a wavelet dictionary
    D_multi = np.r_[
        tuple(
            ricker_matrix(
                width=w, resolution=resolution, n_components=n_components // 5
            )
            for w in (10, 50, 100, 500, 1000)
        )
    ]

    X = np.linspace(0, resolution - 1, resolution)
    first_quarter = X < resolution / 4
    X[first_quarter] = 3.0
    X[np.logical_not(first_quarter)] = -1.0
    X = X.reshape(1, -1)

    # check that the underlying model fails to converge
    with pytest.warns(ConvergenceWarning):
        model = SparseCoder(
            D_multi, transform_algorithm=transform_algorithm, transform_max_iter=1
        )
        model.fit_transform(X)

    # check that the underlying model converges w/o warnings
    with pytest.warns(None) as record:
        model = SparseCoder(
            D_multi, transform_algorithm=transform_algorithm, transform_max_iter=2000
        )
        model.fit_transform(X)
    assert not [w.message for w in record]
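# Sanity-check sketch for the dictionary builder above (assumes ricker_matrix
# is lifted to module scope; the sizes are illustrative): every atom it
# returns is L2-normalized, which keeps the atoms comparable for SparseCoder.
import numpy as np

D = ricker_matrix(width=50, resolution=1024, n_components=64)
assert D.shape == (64, 1024)
assert np.allclose(np.linalg.norm(D, axis=1), 1.0)  # rows are unit-norm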
def test_sparse_coder_parallel_mmap():
    # Non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/5956
    # Test that SparseCoder does not error when passing read-only
    # arrays to child processes
    rng = np.random.RandomState(777)
    n_components, n_features = 40, 64
    init_dict = rng.rand(n_components, n_features)
    # Ensure that `data` is >2MB. Joblib memory-maps arrays
    # if they are larger than 1MB. The 4 accounts for the float32
    # data type.
    n_samples = int(2e6) // (4 * n_features)
    data = np.random.rand(n_samples, n_features).astype(np.float32)
    sc = SparseCoder(init_dict, transform_algorithm='omp', n_jobs=2)
    sc.fit_transform(data)
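# The 1MB threshold mentioned above is joblib's default (max_nbytes='1M' in
# joblib.Parallel). A sketch reproducing the read-only memmap condition
# directly (the /tmp path is an assumption):
import joblib
import numpy as np

data = np.random.rand(8000, 64).astype(np.float32)  # ~2MB, above the 1MB default
joblib.dump(data, '/tmp/data.joblib')
mmap = joblib.load('/tmp/data.joblib', mmap_mode='r')  # read-only memory map
assert not mmap.flags.writeable  # mirrors what worker processes receive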
def test_fit():
    np.random.seed(0)
    N = 32
    K = 4
    X = np.random.randn(100, N)
    X /= np.linalg.norm(X, axis=1)[:, np.newaxis]
    tl = OnlineTransformLearning(transform_n_nonzero_coefs=K).fit(X)
    sc = SparseCoder(tl.components_, transform_n_nonzero_coefs=K)
    code = sc.fit_transform(X)
    tl_error = np.linalg.norm(X - code.dot(tl.components_))
    print('Online Transform Learning:', tl_error)
    W = np.random.randn(N, N)
    sc = SparseCoder(W, transform_n_nonzero_coefs=K)
    code = sc.fit_transform(X)
    random_error = np.linalg.norm(X - code.dot(W))
    print('Random Dictionary:', random_error)
    assert tl_error < random_error
def cluster_sk_sparse_coder(content):
    """Sparse dictionary decomposition of a data matrix via scikit-learn."""
    # SparseCoder accepts none of these keyword arguments and exposes neither
    # `error_` nor `n_iter_`; the parameter names match
    # sklearn.decomposition.SparsePCA, which is what is instantiated here.
    _config = SparsePCA(n_components=content['n_components'],
                        alpha=content['alpha'],
                        ridge_alpha=content['ridge_alpha'],
                        max_iter=content['max_iter'],
                        tol=content['tol'],
                        method=content['sk_method'],
                        n_jobs=-1)
    _result = _config.fit_transform(content['data'])
    return httpWrapper(json.dumps({
        'result': _result.tolist(),
        'components': _config.components_.tolist(),
        'error': _config.error_,
        'iter': _config.n_iter_
    }))
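# Usage sketch for the helper above; the payload values and the httpWrapper
# stub are illustrative assumptions (the real wrapper presumably builds the
# HTTP response):
import numpy as np

def httpWrapper(body):
    return body  # stand-in for the service's response wrapper (assumption)

content = {
    'data': np.random.RandomState(0).rand(50, 8),  # toy matrix
    'n_components': 4,
    'alpha': 1.0,
    'ridge_alpha': 0.01,
    'max_iter': 1000,
    'tol': 1e-8,
    'sk_method': 'lars',  # or 'cd'
}
print(cluster_sk_sparse_coder(content))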
class SMHClassifier(BaseEstimator):
    """
    SMH-based classifier.
    """
    def __init__(self, tuple_size=3, n_tuples=692, wcc=None, ovr_thres=0.7):
        self.tuple_size = tuple_size
        self.wcc = wcc
        self.ovr_thres = ovr_thres
        if wcc:
            self.n_tuples = log(0.5) / log(1.0 - pow(wcc, tuple_size))
        else:
            self.n_tuples = n_tuples

    def discover_topics(self, X, tuple_size=3, n_tuples=692,
                        weights=True, expand=True, thres=0.7, cutoff=3):
        """
        Discovers topics from a text corpus.
        """
        ifs = array_to_listdb(X)
        mined = ifs.mine(tuple_size=tuple_size,
                         num_tuples=n_tuples,
                         weights=weights,
                         expand=expand)
        mined.cutoff(min=cutoff)
        models = mined.cluster_mhlink(thres=thres)
        return models

    def fit(self, X, tuple_size=3, n_tuples=692, weights=True,
            expand=True, thres=0.7, cutoff=3):
        """
        Discovers topics and uses them as a dictionary for sparse coding.
        """
        models = self.discover_topics(X, tuple_size=tuple_size,
                                      n_tuples=n_tuples,
                                      weights=weights,
                                      expand=expand,
                                      thres=thres,
                                      cutoff=cutoff)
        self.coder = SparseCoder(dictionary=normalize(models.toarray()),
                                 transform_algorithm='lasso_lars',
                                 split_sign=True,
                                 n_jobs=4)

    def fit_transform(self, X, tuple_size=3, n_tuples=692, weights=True,
                      expand=True, thres=0.7, cutoff=3):
        """
        Discovers topics and uses them as a dictionary to sparse-code
        the documents.
        """
        self.fit(X, tuple_size=tuple_size, n_tuples=n_tuples,
                 weights=weights, expand=expand, thres=thres,
                 cutoff=cutoff)
        return self.coder.fit_transform(X.todense())

    def transform(self, X):
        """
        Sparse-codes a given set of documents from the discovered topics.
        """
        return self.coder.transform(X.todense())
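# Usage sketch, assuming the smh (Sampled Min-Hashing) package providing
# array_to_listdb is installed and X is a scipy.sparse bag-of-words matrix
# (the toy matrix below is an assumption, not real corpus data):
from scipy.sparse import rand as sparse_rand

X = sparse_rand(100, 500, density=0.05, format='csr', random_state=0)
clf = SMHClassifier(tuple_size=3, n_tuples=692)
codes = clf.fit_transform(X)  # topic discovery + sparse coding in one step
print(codes.shape)  # one row per document; split_sign doubles the width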
y = smooth(y, window_len=resolution // 100, window='hanning')
D = D_fixed
n_nonzero = 10
alpha = None
algo = 'omp'
color_1 = 'red'
title = algo.upper()

coder_1 = SparseCoder(dictionary=D,
                      transform_n_nonzero_coefs=n_nonzero,
                      transform_alpha=alpha,
                      transform_algorithm=algo)
x_ = coder_1.fit_transform(y.reshape(1, -1))
density = len(np.flatnonzero(x_))
x = np.ravel(np.dot(x_, D))
# use a distinct name so sklearn.metrics.mean_squared_error is not shadowed
mse = mean_squared_error(y, x)

plt.plot(y, color='black', lw=2, linestyle='--',
         label='Original signal', alpha=0.5)
plt.plot(x, color=color_1, lw=2,
         label='%s: %s nonzero coefs,\n%.2f error' % (title, density, mse))
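# The fragment above generalizes to other coders. A sketch, assuming D is the
# (n_atoms, resolution) unit-norm dictionary and y the smoothed signal from
# above (the lasso alpha value is an illustrative assumption):
from sklearn.decomposition import SparseCoder
from sklearn.metrics import mean_squared_error
import numpy as np

for name, kwargs in [
        ('OMP', dict(transform_algorithm='omp', transform_n_nonzero_coefs=10)),
        ('Lasso-LARS', dict(transform_algorithm='lasso_lars', transform_alpha=2.0))]:
    coder = SparseCoder(dictionary=D, **kwargs)
    code = coder.fit_transform(y.reshape(1, -1))
    recon = np.ravel(code.dot(D))
    print(name, len(np.flatnonzero(code)), mean_squared_error(y, recon))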
def generate(alg='lasso_cd', alpha=.002, d=100, verbose=True):
    print('Starting up...')

    # load preprocessed data
    U = pickle.load(open('U.pkl', 'rb'))  # common embedding matrix
    # vocab = pickle.load(open('vocab.pkl', 'rb'))  # vocabulary
    vocab = pickle.load(open('minivocab.pkl', 'rb'))  # vocabulary
    codes = pickle.load(open('word.codes', 'rb'))  # sparse codes for common words
    w = []
    # print(U)

    # def sparse_loss(y_pred, w):
    #     '''
    #     Helper function. Custom loss function to be used in model.
    #     '''
    #     v = np.dot(U, y_pred) - w
    #     v = v + alpha * np.sum(y_pred)
    #     v = v + beta * abs(np.sum(y_pred) - 1)
    #     return math.sqrt(np.dot(v, v))

    if verbose:
        print('Initializing parameters...')

    # initialize parameters
    beta = .2
    max_len = 24

    # compute codes
    coder = SparseCoder(U, transform_algorithm=alg, transform_alpha=alpha)  # , split_sign=True)

    X = []
    y = []
    keys = []

    if verbose:
        print('Generating data arrays')

    for word in vocab:
        # if word.startswith('http://') or word.startswith('https://'):
        #     continue
        keys.append(word)
        X.append(random(d))
        y.append(vocab[word])

    if verbose:
        print(np.shape(U))
        print(np.shape(X))

    X = coder.fit_transform(X, y)  # y is accepted but ignored by SparseCoder

    if verbose:
        print(np.shape(X))

    codes = {}
    for k in range(len(keys)):
        codes[keys[k]] = X[k]
    pickle.dump(codes, open('word.codes', 'wb'))

    if verbose:
        # print(codes['m.v.p.'])
        print(keys[-1] + ': ' + str(codes[keys[-1]]))
def nmf_multiple_parts(directory, initial_Z=10, final_Z=40):
    print(datetime.now())
    print("NMF in stages...")
    os.chdir(directory)
    pxy_list = []
    for (dirpath, dirnames, filenames) in os.walk(directory):
        pxy_list.extend(filenames)
        break

    # Initial NMF analysis of each day (could do dictionary learning??)
    print("Learning bases for each chunk...")
    NMF_basis_dict = {}
    for pxy_name in pxy_list:
        # read HF spectrogram; transpose so time runs vertically down
        # (each sample is a row)
        pxy_chunk = np.load(pxy_name).transpose()
        # NMF analysis
        print(datetime.now())
        print("Analysing %s" % pxy_name)
        NMF_learner = NMF(n_components=initial_Z)  # start with a small # of components here
        NMF_learner.fit(pxy_chunk)  # only need to fit, as just the dictionary is wanted
        NMF_basis = NMF_learner.components_
        # NM_weights_desc = pd.DataFrame(NM_power_weights.transpose()).describe()
        # print(NM_weights_desc)
        NMF_basis_dict[str(pxy_name)] = NMF_basis
        print("Basis vectors learned for %s" % pxy_name)

    # Clustering of the learned bases
    print(datetime.now())
    print("Clustering learned bases")
    # create dataset of all learned basis (dictionary) components
    NMF_bases_all = np.concatenate(list(NMF_basis_dict.values()), axis=0)
    # cluster learned components into final_Z clusters
    KMeans_learner = KMeans(n_clusters=final_Z)
    KMeans_learner.fit(NMF_bases_all)
    # set decomposition basis to be cluster centres
    Clustered_basis = KMeans_learner.cluster_centers_
    print("Cluster centres found")

    # Learn weights (Y) for the decomposition X = BY - sparse coding with a
    # pre-computed dictionary
    print("Learning weights for new cluster centres")
    SC_weights_dict = {}
    for pxy_name in pxy_list:
        # read HF spectrogram; transpose so time runs vertically down
        pxy_chunk = np.load(pxy_name).transpose()
        # SC analysis with fixed dictionary
        print(datetime.now())
        print("Learning Sparse Coding weights for %s" % pxy_name)
        # aim for half the coefficients being non-zero (integer division)
        coder = SparseCoder(Clustered_basis,
                            transform_n_nonzero_coefs=final_Z // 2)
        SC_weights = coder.fit_transform(pxy_chunk)
        # file_name = 'scweights/%s' % pxy_name
        # np.save(file_name, SC_weights)
        SC_weights_dict[str(pxy_name)] = SC_weights
        print("Sparse Coding weights learned for %s" % pxy_name)

    return NMF_basis_dict, Clustered_basis, SC_weights_dict
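# For completeness: each chunk is reconstructed from its sparse code by a
# plain matrix product. A sketch, assuming the function's return values and
# pxy_list are in scope (X is approximately code . basis):
pxy_name = pxy_list[0]
approx = SC_weights_dict[pxy_name].dot(Clustered_basis)
original = np.load(pxy_name).transpose()
rel_err = np.linalg.norm(original - approx) / np.linalg.norm(original)
print("relative reconstruction error for %s: %.3f" % (pxy_name, rel_err))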