def test_mini_batch():
    np.random.seed(0)

    T = 100
    # init
    N = 16
    M = 20
    Y = np.random.randn(M * T, N)
    K = 4

    tl = MiniBatchTransformLearning(transform_n_nonzero_coefs=K)
    for i in range(T):
        ys = Y[i * M:(i + 1) * M]
        tl.partial_fit(ys)

    sc = SparseCoder(tl.components_, transform_n_nonzero_coefs=K)
    code = sc.fit_transform(Y)
    tl_error = np.linalg.norm(Y - code.dot(tl.components_))
    print('Mini Batch Transform Learning:', tl_error)

    W = np.random.randn(N, N)
    sc = SparseCoder(W, transform_n_nonzero_coefs=K)
    code = sc.fit_transform(Y)
    random_error = np.linalg.norm(Y - code.dot(W))
    print('Random Dictionary:', random_error)
    assert tl_error < random_error
Example #2
def test_max_iter():
    def ricker_function(resolution, center, width):
        """Discrete sub-sampled Ricker (Mexican hat) wavelet"""
        x = np.linspace(0, resolution - 1, resolution)
        x = (
            (2 / (np.sqrt(3 * width) * np.pi**0.25))
            * (1 - (x - center) ** 2 / width**2)
            * np.exp(-((x - center) ** 2) / (2 * width**2))
        )
        return x

    def ricker_matrix(width, resolution, n_components):
        """Dictionary of Ricker (Mexican hat) wavelets"""
        centers = np.linspace(0, resolution - 1, n_components)
        D = np.empty((n_components, resolution))
        for i, center in enumerate(centers):
            D[i] = ricker_function(resolution, center, width)
        D /= np.sqrt(np.sum(D**2, axis=1))[:, np.newaxis]
        return D

    transform_algorithm = "lasso_cd"
    resolution = 1024
    subsampling = 3  # subsampling factor
    n_components = resolution // subsampling

    # Compute a wavelet dictionary
    D_multi = np.r_[
        tuple(
            ricker_matrix(
                width=w, resolution=resolution, n_components=n_components // 5
            )
            for w in (10, 50, 100, 500, 1000)
        )
    ]

    X = np.linspace(0, resolution - 1, resolution)
    first_quarter = X < resolution / 4
    X[first_quarter] = 3.0
    X[np.logical_not(first_quarter)] = -1.0
    X = X.reshape(1, -1)

    # check that the underlying model fails to converge
    with pytest.warns(ConvergenceWarning):
        model = SparseCoder(
            D_multi, transform_algorithm=transform_algorithm, transform_max_iter=1
        )
        model.fit_transform(X)

    # check that the underlying model converges w/o warnings
    with pytest.warns(None) as record:
        model = SparseCoder(
            D_multi, transform_algorithm=transform_algorithm, transform_max_iter=2000
        )
        model.fit_transform(X)
    assert not [w.message for w in record]
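
# A minimal standalone sketch (an alternative, not part of the original test): newer pytest
# releases deprecate `pytest.warns(None)`, so the "no warnings" check above can also be
# written with the standard warnings module. `D_multi`, `X`, `transform_algorithm` and
# `ConvergenceWarning` refer to the names used in the test body.
import warnings

with warnings.catch_warnings():
    # escalate ConvergenceWarning to an error so a non-converging solver fails loudly
    warnings.simplefilter("error", category=ConvergenceWarning)
    model = SparseCoder(
        D_multi, transform_algorithm=transform_algorithm, transform_max_iter=2000
    )
    model.fit_transform(X)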
Example #3
def test_sparse_coder_parallel_mmap():
    # Non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/5956
    # Test that SparseCoder does not error by passing read-only
    # arrays to child processes

    rng = np.random.RandomState(777)
    n_components, n_features = 40, 64
    init_dict = rng.rand(n_components, n_features)
    # Ensure that `data` is > 2 MB. Joblib memory-maps arrays
    # if they are larger than 1MB. The 4 accounts for float32
    # data type
    n_samples = int(2e6) // (4 * n_features)
    data = np.random.rand(n_samples, n_features).astype(np.float32)

    sc = SparseCoder(init_dict, transform_algorithm='omp', n_jobs=2)
    sc.fit_transform(data)
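
# A quick sanity check of the size comment above (assumption: joblib's default memmapping
# threshold is 1 MB): n_samples * n_features * 4 bytes per float32 value should exceed
# 1,000,000 bytes, so the worker processes receive a read-only memory-mapped array.
n_features = 64
n_samples = int(2e6) // (4 * n_features)  # 7812 rows
print(n_samples * n_features * 4)         # 1,999,872 bytes, comfortably above 1 MB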
Example #5
def test_fit():
    np.random.seed(0)
    N = 32
    K = 4

    X = np.random.randn(100, N)
    X /= np.linalg.norm(X, axis=1)[:, np.newaxis]

    tl = OnlineTransformLearning(transform_n_nonzero_coefs=K).fit(X)
    sc = SparseCoder(tl.components_, transform_n_nonzero_coefs=K)
    code = sc.fit_transform(X)
    tl_error = np.linalg.norm(X - code.dot(tl.components_))
    print('Online Transform Learning:', tl_error)

    W = np.random.randn(N, N)
    sc = SparseCoder(W, transform_n_nonzero_coefs=K)
    code = sc.fit_transform(X)
    random_error = np.linalg.norm(X - code.dot(W))
    print('Random Dictionary:', random_error)
    assert tl_error < random_error
Example #6
def cluster_sk_sparse_coder(content):
    """ x """
    _config = SparseCoder(n_components=content['n_components'],
                          alpha=content['alpha'],
                          ridge_alpha=content['ridge_alpha'],
                          max_iter=content['max_iter'],
                          tol=content['tol'],
                          method=content['sk_method'],
                          n_jobs=-1)
    _result = _config.fit_transform(content['data'])
    return httpWrapper(
        json.dumps({
            'result': _result.tolist(),
            'components': _config.components_.tolist(),
            'error': _config.error_,
            'iter': _config.n_iter_
        }))
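
# For reference, a minimal sketch of driving SparseCoder directly with its own constructor
# arguments against a fixed, precomputed dictionary. The dictionary and data below are
# random placeholders, and the parameter values are purely illustrative.
import numpy as np
from sklearn.decomposition import SparseCoder

rng = np.random.RandomState(0)
dictionary = rng.randn(30, 64)                                   # (n_components, n_features)
dictionary /= np.linalg.norm(dictionary, axis=1, keepdims=True)  # unit-norm atoms
data = rng.randn(10, 64)

coder = SparseCoder(dictionary,
                    transform_algorithm='lasso_lars',
                    transform_alpha=0.1,
                    n_jobs=-1)
codes = coder.fit_transform(data)                                # sparse codes, shape (10, 30)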
Example #7
keys = []

for word in vocab:
#    if word.startswith('http://'):
#        continue
#
#    if word.startswith('https://'):
#        continue
    
    keys.append(word)
    X.append(random(2000))
    y.append(vocab[word])

print np.shape(U)
print np.shape(X)
    
X = coder.fit_transform(X, y)

print np.shape(X)

codes = {}

for k in range(len(keys)):
    codes[keys[k]] = X[k]
    
pickle.dump(codes, open('word.codes', 'w'))

#print codes['m.v.p.']
print keys[-1] + ': ' + str(codes[keys[-1]])
#print vocab[keys[-1]]
class SMHClassifier(BaseEstimator):
    """
    SMH-based classifier.
    """
    def __init__(self, tuple_size=3, n_tuples=692,
                 wcc=None, ovr_thres=0.7):
        self.tuple_size = tuple_size

        if wcc:
            self.wcc = wcc
            self.n_tuples = log(0.5) / log(1.0 - pow(wcc, tuple_size))
        else:
            self.n_tuples = n_tuples

    def discover_topics(self, X, tuple_size=3, n_tuples=692,
                        weights=True, expand=True,
                        thres=0.7, cutoff=3):
        """
        Discovers topics from a text corpus.
        """
        ifs = array_to_listdb(X)
        mined = ifs.mine(tuple_size=tuple_size,
                         num_tuples=n_tuples,
                         weights=weights,
                         expand=expand)
        mined.cutoff(min=cutoff)
        models = mined.cluster_mhlink(thres=thres)
    
        return models

    def fit(self, X, tuple_size=3, n_tuples=692,
            weights=True, expand=True,
            thres=0.7, cutoff=3):
        """
        Discovers topics and uses them as a dictionary for sparse coding.
        """
        models = self.discover_topics(X,
                                      tuple_size=tuple_size,
                                      n_tuples=n_tuples,
                                      weights=weights,
                                      expand=expand,
                                      thres=thres,
                                      cutoff=cutoff)
        self.coder = SparseCoder(dictionary=normalize(models.toarray()),
                                 transform_algorithm='lasso_lars',
                                 split_sign=True,
                                 n_jobs=4)
        
    def fit_transform(self, X, tuple_size=3, n_tuples=692,
                      weights=True, expand=True,
                      thres=0.7, cutoff=3):
        """
        Discovers topics and uses them as a dictionary to sparse-code
        the documents.
        """
        self.fit(X, tuple_size=tuple_size, n_tuples=n_tuples, weights=weights, expand=expand,
                 thres=thres, cutoff=cutoff)
        return self.coder.fit_transform(X.todense())

    
    def transform(self, X):
        """
        Sparse-code a given set of documents from the
        discovered topics.
        """
        return self.coder.transform(X.todense())
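
# Hypothetical usage sketch for the class above. `bow_train` and `bow_new` stand in for
# scipy.sparse bag-of-words matrices, and the smh helpers used inside the class
# (array_to_listdb, normalize) are assumed importable as in the original module.
clf = SMHClassifier(tuple_size=3, n_tuples=692)
train_codes = clf.fit_transform(bow_train)   # mine topics, then sparse-code the training documents
new_codes = clf.transform(bow_new)           # reuse the discovered topic dictionary on new documents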
Example #9
y = smooth(y, window_len=resolution // 100, window='hanning')

D = D_fixed
n_nonzero = 10
alpha = None
algo = 'omp'
color_1 = 'red'
title = algo.upper()

coder_1 = SparseCoder(dictionary=D,
                      transform_n_nonzero_coefs=n_nonzero,
                      transform_alpha=alpha,
                      transform_algorithm=algo)

x_ = coder_1.fit_transform(y.reshape(1, -1))
density = len(np.flatnonzero(x_))
x = np.ravel(np.dot(x_, D))
mse = mean_squared_error(y, x)  # reconstruction error of the OMP code

plt.plot(y,
         color='black',
         lw=2,
         linestyle='--',
         label='Original signal',
         alpha=0.5)
plt.plot(x,
         color=color_1,
         lw=2,
         label='%s: %s nonzero coefs,\n%.2f error' %
         (title, density, mse))
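
# A minimal follow-up sketch reusing `D` and `y` from above: the lasso variant penalizes the
# code through transform_alpha instead of fixing the number of nonzero coefficients (the
# alpha value here is illustrative).
coder_2 = SparseCoder(dictionary=D,
                      transform_alpha=0.05,
                      transform_algorithm='lasso_lars')
x2_ = coder_2.fit_transform(y.reshape(1, -1))
x2 = np.ravel(np.dot(x2_, D))
print(len(np.flatnonzero(x2_)), mean_squared_error(y, x2))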
Example #10
def generate(alg='lasso_cd', alpha=.002, d=100, verbose=True):
    print 'Starting up...'

    # load preprocessed data
    U = pickle.load(open('U.pkl', 'r'))  # common embedding matrix
    #    vocab = pickle.load(open('vocab.pkl', 'r')) # vocabulary
    vocab = pickle.load(open('minivocab.pkl', 'r'))  # vocabulary
    codes = pickle.load(open('word.codes', 'r'))  # sparse codes for common words

    w = []

    #print U

    #def sparse_loss(y_pred, w):
    #    '''
    #        Helper function. Custom loss function to be used in model.
    #    '''
    #    v = np.dot(U, y_pred) - w
    #    v = v + alpha * np.sum(y_pred)
    #    v = v + beta * abs(np.sum(y_pred) - 1)
    #
    #    return math.sqrt(np.dot(v, v))

    if verbose:
        print 'Initializing parameters...'

    # initialize parameters
    beta = .2

    max_len = 24

    # compute codes
    coder = SparseCoder(U, transform_algorithm=alg,
                        transform_alpha=alpha)  #, split_sign=True)

    X = []
    y = []
    keys = []

    if verbose:
        print 'Generating data arrays'

    for word in vocab:
        #    if word.startswith('http://'):
        #        continue
        #
        #    if word.startswith('https://'):
        #        continue

        keys.append(word)
        X.append(random(d))
        y.append(vocab[word])

    if verbose:
        print np.shape(U)
        print np.shape(X)

    X = coder.fit_transform(X, y)

    if verbose:
        print np.shape(X)

    codes = {}

    for k in range(len(keys)):
        codes[keys[k]] = X[k]

    pickle.dump(codes, open('word.codes', 'w'))

    if verbose:
        #print codes['m.v.p.']
        print keys[-1] + ': ' + str(codes[keys[-1]])
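
# A small follow-up sketch (hypothetical, not in the original script), assuming generate()
# has already been run and written 'word.codes': reload the pickled codes and report how
# sparse they are on average. Like the script above, this assumes Python 2 (the codes were
# pickled through a text-mode file handle).
import pickle

import numpy as np

codes = pickle.load(open('word.codes', 'r'))
nonzeros = [np.count_nonzero(c) for c in codes.values()]
print('mean nonzeros per word code: %.1f' % (float(sum(nonzeros)) / len(nonzeros)))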
Example #11
def nmf_multiple_parts(directory, initial_Z=10, final_Z=40):
    print datetime.now()
    print "NMF in stages..."
    os.chdir(directory)
    pxy_list = []
    for (dirpath, dirnames, filenames) in os.walk(directory):
        pxy_list.extend(filenames)
        break

    # Initial NMF analysis of each day (could do dictionary learning??)
    print "Learning bases for each chunk..."
    NMF_basis_dict = {}
    for pxy_name in pxy_list:
        # read HF spectrogram
        # transpose so time runs vertically down (each sample is a row)
        pxy_chunk = np.load(pxy_name).transpose()

        # NMF analysis
        print datetime.now()
        print "Analysing %s" % pxy_name
        NMF_learner = NMF(n_components=initial_Z)  # start with a small number of components here
        NMF_learner.fit(pxy_chunk)  # only need to fit, as just the dictionary is wanted
        NMF_basis = NMF_learner.components_
        #NM_weights_desc = pd.DataFrame(NM_power_weights.transpose()).describe()
        #print NM_weights_desc
        NMF_basis_dict[str(pxy_name)] = NMF_basis
        print "Basis vectors learned for %s" % pxy_name

    # Clustering of the learned bases
    print datetime.now()
    print "Clustering learned bases"

    # create dataset of all learned basis (dictionary) components
    NMF_bases_all = np.concatenate(NMF_basis_dict.values(), axis=0)

    # cluster learned components into final_Z clusters
    KMeans_learner = KMeans(n_clusters=final_Z)
    KMeans_learner.fit(NMF_bases_all)

    # set decomposition basis to be cluster centres
    Clustered_basis = KMeans_learner.cluster_centers_
    print "Cluster centres found"

    # Learn weights (Y) for decomposition X = BY - Sparse coding with a pre-computed dictionary
    print "Learning weights for new cluster centres"
    SC_weights_dict = {}
    for pxy_name in pxy_list:
        # read HF spectrogram
        # transpose so time runs vertically down (each sample is a row)
        pxy_chunk = np.load(pxy_name).transpose()

        # SC analysis with fixed dictionary
        print datetime.now()
        print "Learning Sparse Coding weights for %s" % pxy_name
        # aim for half of the coefficients being non-zero
        coder = SparseCoder(Clustered_basis,
                            transform_n_nonzero_coefs=final_Z // 2)
        SC_weights = coder.fit_transform(pxy_chunk)
        #        file_name = 'scweights/%s' % pxy_name
        #        np.save(file_name, SC_weights)
        SC_weights_dict[str(pxy_name)] = SC_weights
        print "Sparse Coding weights learned for %s" % pxy_name

    return NMF_basis_dict, Clustered_basis, SC_weights_dict
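
# A minimal sanity-check sketch for the X ~= Y.B decomposition described above: reconstruct
# one chunk from its sparse-coding weights and the clustered basis. The directory path is a
# placeholder, and the chunk files are assumed to still be on disk in that directory.
import numpy as np

basis_dict, clustered_basis, weights_dict = nmf_multiple_parts('/path/to/spectrogram/chunks')
name = list(weights_dict)[0]
chunk = np.load(name).transpose()
approx = weights_dict[name].dot(clustered_basis)
print('relative reconstruction error for %s: %.3f' %
      (name, np.linalg.norm(chunk - approx) / np.linalg.norm(chunk)))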