Example No. 1
import networkx as nx
import numpy as np
import scipy.sparse as sparse
import scipy.sparse.linalg as la

def normalized_laplacian_embed(G, d, scaled=False):
    """ Generates an n by d matrix using an SVD of the normalized Laplacian

    Each row of the output corresponds to a node (ordered according to G.node)
    so that each node is assigned a vector in d-dimensional Euclidean space.

    Parameters
    ----------
    G -- networkx graph
    d -- embedding dimension
    scaled -- whether to scale the embedding by the square root
             of the singular values (default=False)

    Returns
    -------
    n times d matrix where n=G.number_of_nodes()
    """
    # use the normalized Laplacian, as the docstring promises
    L = nx.normalized_laplacian_matrix(G)

    if scaled:
        u, s, _ = la.svds(sparse.csr_matrix(L), d)
        return np.dot(u, np.diag(np.sqrt(s)))
    else:
        u, _, _ = la.svds(sparse.csr_matrix(L), d)
        return u
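A quick usage sketch (the Zachary karate-club graph below is just a convenient built-in test graph, not part of the original example):

G = nx.karate_club_graph()
X = normalized_laplacian_embed(G, d=2)
print(X.shape)  # (34, 2): one 2-dimensional vector per node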
Example No. 2
import numpy as np
from scipy.sparse.linalg import svds

def fastsvds(M, r):
    """
    "Fast" but less accurate SVD, obtained from the SVD of M M^T or M^T M
    ***IF*** one of the dimensions of M is much smaller than the other.
    Note: this is numerically less stable, but useful for large hyperspectral
    images. Returns (u, s, vt) with s an r-by-r diagonal matrix.
    """
    m, n = M.shape
    rationmn = 10  # parameter, should be >= 1

    if m < rationmn * n:
        # m is not much larger than n: work with the m-by-m Gram matrix M M^T
        MMt = np.dot(M, M.T)
        u, s, _ = svds(MMt, r)                 # columns of u: left singular vectors of M
        v = np.dot(M.T, u)                     # recover the right singular vectors
        v /= np.sqrt(np.sum(v ** 2, axis=0) + 1e-16)   # normalize each column
        s = np.sqrt(np.diag(s))                # singular values of M M^T are those of M, squared
        vt = v.T
    elif n < rationmn * m:
        # n is not much larger than m: work with the n-by-n Gram matrix M^T M
        MtM = np.dot(M.T, M)
        w, s, _ = svds(MtM, r)                 # columns of w: right singular vectors of M
        u = np.dot(M, w)                       # recover the left singular vectors
        u /= np.sqrt(np.sum(u ** 2, axis=0) + 1e-16)
        s = np.sqrt(np.diag(s))
        vt = w.T
    else:
        u, s, vt = svds(M, r)
        s = np.diag(s)
    return (u, s, vt)
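A small sanity check of fastsvds against a direct sparse SVD; the tall test matrix below is hypothetical and chosen so that the M^T M branch is exercised:

M = np.random.rand(2000, 50)   # n << m
u, s, vt = fastsvds(M, 5)
s_ref = svds(M, k=5, return_singular_vectors=False)
print(np.allclose(np.sort(np.diag(s)), np.sort(s_ref), rtol=1e-6))  # True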
Example No. 3
def sci_pseudoinverse(Mat, precision):
    """
    Pseudoinverse of a sparse matrix via truncated SVD, using scipy.
    The function takes a sparse matrix and a precision score as input;
    the score is the percentage of the smaller dimension used as the
    number of singular values k.
    """
    matrix = Mat.tocsc()
    if matrix.shape[0] <= matrix.shape[1]:
        val = int((precision * matrix.shape[0]) / 100)
        u, s, vt = ssl.svds(matrix.tocsc(), k=val)
        UT = ss.csr_matrix(np.nan_to_num(u.transpose()))
        SI = ss.csr_matrix(np.nan_to_num(np.diag(1 / s)))
        VT = ss.csr_matrix(np.nan_to_num(vt))

        # pinv(A) = V * diag(1/s) * U^T
        temp_matrix = spmatrixmul(VT.transpose(), SI)
        pinv_matrix = spmatrixmul(temp_matrix, UT)
        del u, s, vt, UT, SI, VT, temp_matrix

    else:
        val = int((precision * matrix.transpose().shape[0]) / 100)
        u, s, vt = ssl.svds(matrix.transpose().tocsc(), k=val)
        UT = ss.csr_matrix(np.nan_to_num(u.transpose()))
        SI = ss.csr_matrix(np.nan_to_num(np.diag(1 / s)))
        VT = ss.csr_matrix(np.nan_to_num(vt))

        # with A^T = u s vt, the product u * diag(1/s) * vt equals pinv(A)
        temp_matrix = spmatrixmul(UT.transpose(), SI)
        pinv_matrix = spmatrixmul(temp_matrix, VT)
        del u, s, vt, UT, SI, VT, temp_matrix

    return pinv_matrix.tocsr()
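The function above leans on an external spmatrixmul helper, but the underlying identity is just pinv(A) = V diag(1/s) U^T. A minimal dense sketch of that identity:

import numpy as np

A = np.random.rand(8, 5)
U, s, Vt = np.linalg.svd(A, full_matrices=False)
pinv = Vt.T @ np.diag(1.0 / s) @ U.T
print(np.allclose(pinv, np.linalg.pinv(A)))  # True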
Example No. 4
def rank_constrained_least_squares(X, Y, rank, alpha1, alpha2=None,
                                   U0=None, V0=None,
                                   max_bfgs_iter=500,
                                   m=10,
                                   gradient_tolerance=1e-5,
                                   callback=None,
                                   verbose=3):
    """
    Minimizes
    .5 * ||X U V.T - Y|| ** 2 + .5 * (alpha1 * ||U|| ** 2 + alpha2 * ||V|| ** 2)

    """

    if alpha2 is None:
        alpha2 = alpha1

    energy_function = get_vec_and_grad_func(X, Y, alpha1, alpha2, rank, X.shape[1], callback=callback)
    #energy_gradient = get_grad_func(X, Y, alpha1, alpha2, rank, len(X.T))

    # if not already done, initialize U and V
    if V0 is None:
        if U0 is not None:
            # if only V0 is None initialize U with a least squares
            U = U0.copy()
            V = np.linalg.pinv(X.dot(U)).dot(Y).T
        else:
            # decompose a ridge solution
            _, largest_singular_value_of_X, _ = svds(X, k=1)
            ridge_penalty = largest_singular_value_of_X * .1
            ridge = Ridge(alpha=ridge_penalty)
            ridge_coef = ridge.fit(X, Y).coef_.T
            U, s, VT = svds(ridge_coef, k=rank)
            V = VT.T * np.sqrt(s)
            U *= np.sqrt(s)[np.newaxis, :]
    else:
        V = V0.copy()
        if U0 is None:
            raise ValueError("U0 must be provided when V0 is given")
        U = U0.copy()

    initial_UV_vec = np.vstack([U, V]).ravel()

    result = fmin_l_bfgs_b(energy_function,
                           x0=initial_UV_vec,
                           #fprime=energy_gradient,
                           #maxiter=max_bfgs_iter,
                           maxfun=max_bfgs_iter,
                           # gtol=gradient_tolerance,
                           m=m,
                           #callback=callback,
                           iprint=verbose)

    concat_matrix = result[0].reshape(-1, rank)
    n_features = X.shape[1]
    U_res = concat_matrix[:n_features]
    V_res = concat_matrix[n_features:]

    return U_res, V_res, result[1:]
Example No. 5
def pca_analysis_dense(X_train, X_test, n_components):
    u, s, v = linalg.svds(X_train, n_components)
    print(s.shape)
    screePlot('original.pdf', s[::-1])
    normalized = normalize(X_train, norm='l1', axis=0)
    u, s, v = linalg.svds(normalized, n_components)
    print(s.shape)
    screePlot('normalized.pdf', s[::-1])
Example No. 6
    def get_singularvalues_v_cycle(self, nu0=0, nu1=1, all_svdvals=False, k_max=5, k_min=5):
        T, P_inv = self.get_v_cycle_it_matrix(nu0, nu1)
        if all_svdvals:
            return sp.linalg.svdvals(T.todense())
        else:
            # svds takes return_singular_vectors (not return_eigenvectors) and
            # returns a plain ndarray, so concatenate the two value sets
            largest = sprsla.svds(T, k=k_max, which='LM', return_singular_vectors=False)
            smallest = sprsla.svds(T, k=k_min, which='SM', return_singular_vectors=False)
            return np.concatenate([largest, smallest])
Example No. 7
def scipy_svds(a, k=6, ncv=None, return_vecs=True, **kwargs):
    """ Compute a number of singular value pairs """
    settings = {
        'k': k,
        'ncv': choose_ncv(k, a.shape[0]) if ncv is None else ncv,
        'return_singular_vectors': return_vecs}
    if return_vecs:
        uk, sk, vtk = spla.svds(a, **settings, **kwargs)
        so = np.argsort(-sk)
        return np.asmatrix(uk[:, so]), sk[so], np.asmatrix(vtk[so, :])
    else:
        sk = spla.svds(a, **settings, **kwargs)
        return sk[np.argsort(-sk)]
Example No. 8
def returnsvd(filepath, k):

    # Read data
    print("Creation of data with ratings")
    datam = pd.read_csv(filepath, engine="python", iterator=True, sep="::", chunksize=10000, usecols=[1, 2])
    data = pd.concat([chunk for chunk in datam], ignore_index=True)
    data.columns = ["item_id", "tag"]

    # normalize case: lowercase all tags
    data.tag = data.tag.astype(str)
    data.tag = data.tag.apply(str.lower)

    count = data.groupby(["item_id", "tag"]).size()
    data = count.reset_index()
    data.columns = ["item_id", "tag_id", "count"]

    # sort the items and keep a map back to the original indices
    inditem = np.sort(data["item_id"].unique())
    reinditem = pd.Series({inditem[i]: i for i in np.arange(len(inditem))})
    data["item_id"] = reinditem[data["item_id"].values].values

    # reindex the tags in the same way
    indtag = np.sort(data["tag_id"].unique())
    reindtag = pd.Series({indtag[i]: i for i in np.arange(len(indtag))})
    data["tag_id"] = reindtag[data["tag_id"].values].values
    data_sparse = coo_matrix(
        (data["count"].values.astype(float), (data["item_id"].values, data["tag_id"].values))
    ).tolil()

    print("..........sparse matrix built")

    # compute the actual svd
    p, d, q = splin.svds(data_sparse.tocsc(), k)

    return p, d, q, reinditem
Example No. 9
    def _CFSVD(self, ratingsMat):
        user_ratings_mean = np.mean(ratingsMat, axis=1)  # mean over each user's ratings
        R_demeaned = ratingsMat - user_ratings_mean.reshape(-1, 1)
        from scipy.sparse.linalg import svds
        U, sigma, Vt = svds(R_demeaned, k=10)
        sigma = np.diag(sigma)
        self.all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
Example No. 10
    def get(self):
        B = dok_matrix((self.rows, self.d), dtype=float32)
        for ((row, col, val), p) in self.sampler.get(with_probabilities=True):
            B[row, col] += val / (p * self.nnz)
        covariance = dot(B.transpose(), B)
        (_, s, Vt) = svds(covariance, k=self.ell, maxiter=50, return_singular_vectors=True)
        return dot(diag(sqrt(s[:self.ell])), Vt[:self.ell, :])
Example No. 11
    def _fit(self, X):
        X = as_float_array(X, copy=False)
        random_state = check_random_state(self.random_state)

        if self.algorithm == "arpack":
            U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol)
            # svds doesn't abide by scipy.linalg.svd/randomized_svd
            # conventions, so reverse its outputs.
            Sigma = Sigma[::-1]
            U, VT = svd_flip(U[:, ::-1], VT[::-1])

        elif self.algorithm == "randomized":
            k = self.n_components
            n_features = X.shape[1]
            if k >= n_features:
                raise ValueError("n_components must be < n_features;"
                                 " got %d >= %d" % (k, n_features))
            U, Sigma, VT = randomized_svd(X, self.n_components,
                                          n_iter=self.n_iter,
                                          random_state=random_state)
        else:
            raise ValueError("unknown algorithm %r" % self.algorithm)

        self.components_ = VT
        return U, Sigma, VT
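The reversal above is needed because the ARPACK-based svds typically returns singular values in ascending order, while scipy.linalg.svd returns them in descending order. A short, self-contained check of the convention mismatch:

import numpy as np
from scipy.linalg import svd
from scipy.sparse.linalg import svds

X = np.random.rand(30, 10)
s_svds = svds(X, k=5, return_singular_vectors=False)
s_ref = svd(X, full_matrices=False)[1][:5]          # top 5, descending
print(np.allclose(np.sort(s_svds)[::-1], s_ref))    # True: same leading values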
Example No. 12
def svd_wifis(wf_lists, hash_num, nk):
    data_size = wf_lists.shape[0]
    ij = np.zeros((2, data_size * 10))
    data = np.zeros((data_size * 10))
    row = data_i = 0
    macs = set()
    for wf_list in wf_lists:
        wf = common.str_to_wf(wf_list)
        for (k, v) in wf.items():
            ki = int(k, base=16)
            mask = 0xffffffffffff
            # skip null and broadcast MAC addresses
            if ki == 0 or ki & mask == mask:
                continue
            # k = get_hash(k)
            k = get_mac_idx(ki)
            (ij[0, data_i], ij[1, data_i], data[data_i]) = (row, k, v)
            data_i += 1
            macs.add(k)
        row += 1

    m = sp.csr_matrix((data, ij))
    (u, s, vt) = la.svds(m, k=min(nk, min(m.shape) // 2))

    print(m.todense())
    return u, s, vt
Example No. 13
def get_svd(data):
    records = np.genfromtxt(data, dtype=dt, delimiter='\t')
    row = 0
    data_i = 0
    ij = np.zeros((2, records.shape[0] * 10), dtype=np.uint64)
    data = np.zeros((records.shape[0] * 10), dtype=float)
    macs = set()
    for wf_list in records['wf_list']:
        wf = common.str_to_wf(wf_list)
        for (k, v) in wf.items():
            ki = int(k, base=16)
            mask = 0xffffffffffff
            # skip null and broadcast MAC addresses
            if ki == 0 or ki & mask == mask:
                continue
            # k = get_hash(k)
            k = get_mac_idx(ki)
            (ij[0, data_i], ij[1, data_i], data[data_i]) = (row, k, v)
            data_i += 1
            macs.add(k)
        row += 1

    m = sp.csr_matrix((data, ij))
    (u, s, vt) = la.svds(m, k=10)
    print('\n'.join(['\t'.join(p) for p in filter_small(u)]))
Example No. 14
def svd_factorize_matrix(y_mat, rank, return_embeddings=False):
    """
    Best low-rank approximation of a matrix under square loss with fully observed entries.
    Args:
        y_mat: input matrix to approximate
        rank: rank of the approximation
        return_embeddings: boolean. If True, it returns the embeddings instead of the approximate matrix

    Returns:
        approximate matrix of the specified rank

    Example:
        >>> np.random.seed(1)
        >>> mat = toy_factorization_problem(5, 4)
        >>> svd_factorize_matrix(mat, 2)
        array([[ 3.492,  0.148,  1.681,  1.545],
               [ 2.356, -0.032,  1.273,  0.648],
               [ 6.038,  0.099,  3.074,  2.198],
               [ 3.338, -0.508,  2.295, -0.472],
               [ 0.09 ,  0.148, -0.11 ,  0.473]])
    """
    from scipy.sparse.linalg import svds
    u1_mat, d1_vec, v1_matt = svds(y_mat, rank)
    # split each singular value evenly between the two embedding factors
    d1_diag_matrix = np.diag(np.sqrt(d1_vec))
    u = np.dot(u1_mat, d1_diag_matrix)
    v = np.dot(v1_matt.T, d1_diag_matrix)
    if return_embeddings:
        return u, v
    else:
        return np.dot(u, v.T)
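A quick self-contained check that splitting the singular values evenly between the two embedding factors reproduces the plain rank-k approximation:

import numpy as np

y = np.random.rand(6, 5)
u, v = svd_factorize_matrix(y, 2, return_embeddings=True)
print(np.allclose(u.dot(v.T), svd_factorize_matrix(y, 2)))  # True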
Example No. 15
def rank_trunc(gram_mat, k, fast=True):
    """
    k-th order approximation of the Gram Matrix G.

    Parameters
    ----------
    gram_mat : array, shape (n_samples, n_samples)
        the Gram matrix
    k : int
        the order approximation
    fast : bool
        use svd (if False) or svds (if True).

    Return
    ------
    gram_mat_k : array, shape (n_samples, n_samples)
        The rank k Gram matrix.
    """
    if fast:
        u, s, v = svds(gram_mat, k)
    else:
        U, S, V = svd(gram_mat)  # U, V are [n x n]
        # keep the k leading singular triplets: all rows of U, first k columns;
        # first k rows of V, all columns
        s = S[:k]
        u = U[:, :k]
        v = V[:k, :]
    gram_mat_k = (u.dot(np.diag(s))).dot(v)
    return gram_mat_k
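Both code paths should agree on a small Gram matrix; a quick check (assuming numpy as np, scipy.linalg.svd, and scipy.sparse.linalg.svds are imported as the function expects):

import numpy as np

rng = np.random.RandomState(0)
B = rng.rand(20, 20)
G = B.dot(B.T)   # a positive semi-definite Gram matrix
print(np.linalg.norm(rank_trunc(G, 5, fast=True) - rank_trunc(G, 5, fast=False)))  # ~0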
Example No. 16
def computePCsPython(out_dir, k, bfile, ffile):
    """ reading in """
    RV = plink_reader.readBED(bfile, useMAFencoding=True)
    X = RV['snps']

    """ normalizing markers """
    print('Normalizing SNPs...')
    p_ref = X.mean(axis=0) / 2.
    X -= 2 * p_ref

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        X /= SP.sqrt(2 * p_ref * (1 - p_ref))

    hasNan = SP.any(SP.isnan(X), axis=0)
    print('%d SNPs have a nan entry. Excluding them for computing the covariance matrix.' % hasNan.sum())
    X = X[:, ~hasNan]

    """ computing principal components """
    U, S, Vt = SSL.svds(X, k=k)
    U -= U.mean(0)
    U /= U.std(0)
    U = U[:, ::-1]

    """ saving to output """
    NP.savetxt(ffile, U, delimiter='\t', fmt='%.6f')
Example No. 17
    def _fit_truncated(self, X, n_components, svd_solver):
        """Fit the model by computing truncated SVD (by ARPACK or randomized)
        on X
        """
        n_samples, n_features = X.shape

        if isinstance(n_components, six.string_types):
            raise ValueError("n_components=%r cannot be a string "
                             "with svd_solver='%s'"
                             % (n_components, svd_solver))
        elif not 1 <= n_components <= n_features:
            raise ValueError("n_components=%r must be between 1 and "
                             "n_features=%r with svd_solver='%s'"
                             % (n_components, n_features, svd_solver))
        elif svd_solver == 'arpack' and n_components == n_features:
            raise ValueError("n_components=%r must be stricly less than "
                             "n_features=%r with svd_solver='%s'"
                             % (n_components, n_features, svd_solver))

        random_state = check_random_state(self.random_state)

        # Center data
        self.mean_ = np.mean(X, axis=0)
        X -= self.mean_

        if svd_solver == 'arpack':
            # random init solution, as ARPACK does it internally
            v0 = random_state.uniform(-1, 1, size=min(X.shape))
            U, S, V = svds(X, k=n_components, tol=self.tol, v0=v0)
            # svds doesn't abide by scipy.linalg.svd/randomized_svd
            # conventions, so reverse its outputs.
            S = S[::-1]
            # flip eigenvectors' sign to enforce deterministic output
            U, V = svd_flip(U[:, ::-1], V[::-1])

        elif svd_solver == 'randomized':
            # sign flipping is done inside
            U, S, V = randomized_svd(X, n_components=n_components,
                                     n_iter=self.iterated_power,
                                     flip_sign=True,
                                     random_state=random_state)

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = V
        self.n_components_ = n_components

        # Get variance explained by singular values
        self.explained_variance_ = (S ** 2) / (n_samples - 1)
        total_var = np.var(X, ddof=1, axis=0)
        self.explained_variance_ratio_ = \
            self.explained_variance_ / total_var.sum()
        self.singular_values_ = S.copy()  # Store the singular values.
        if self.n_components_ < min(n_features, n_samples):
            self.noise_variance_ = (total_var.sum() -
                                    self.explained_variance_.sum())
            self.noise_variance_ /= min(n_features, n_samples) - n_components
        else:
            self.noise_variance_ = 0.

        return U, S, V
Example No. 18
    def fit(self, X, Y):
        # copy since this will contains the centered data
        check_consistent_length(X, Y)
        X = check_array(X, dtype=np.float64, copy=self.copy)
        Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False)
        if Y.ndim == 1:
            Y = Y.reshape(-1, 1)

        if self.n_components > max(Y.shape[1], X.shape[1]):
            raise ValueError("Invalid number of components n_components=%d"
                             " with X of shape %s and Y of shape %s."
                             % (self.n_components, str(X.shape), str(Y.shape)))

        # Scale (in place)
        X, Y, self.x_mean_, self.y_mean_, self.x_std_, self.y_std_ = (
            _center_scale_xy(X, Y, self.scale))
        # svd(X'Y)
        C = np.dot(X.T, Y)

        # The arpack svds solver only works if the number of extracted
        # components is smaller than rank(X) - 1. Hence, if we want to extract
        # all the components (C.shape[1]), we have to use another one. Else,
        # let's use arpack to compute only the interesting components.
        if self.n_components >= np.min(C.shape):
            U, s, V = svd(C, full_matrices=False)
        else:
            U, s, V = svds(C, k=self.n_components)
        # Deterministic output
        U, V = svd_flip(U, V)
        V = V.T
        self.x_scores_ = np.dot(X, U)
        self.y_scores_ = np.dot(Y, V)
        self.x_weights_ = U
        self.y_weights_ = V
        return self
Example No. 19
   def cv(self, factor,
                split_val,
                shadow_func=None,
                shadow_to_val=None,
                del_freq=None):
      """
         Cross-validate prediction of factor 'factor'.
      """

      self._prepare(factor,
                    split_val,
                    shadow_func=shadow_func,
                    shadow_to_val=shadow_to_val,
                    del_freq=del_freq)

      fac_ind = self.col_names.index(factor)
      self.clf = KNNC(40, algorithm='brute', metric='cosine')
      z=self._get_features_only(self.non_null_set).astype(float)
      target = np.ravel(self.non_null_set.getcol(fac_ind).todense())
      u, s, v = linalg.svds(z, k=51)
      T = u.dot(np.diag(s))

      kf = cross_validation.KFold(len(target), 5)
      for train_idx, test_idx in kf:
         # print(len(train_idx), len(test_idx))
         self.clf.fit(T[train_idx], target[train_idx])
         r = self.clf.predict(T[test_idx])
         print('Average error:',
               np.mean(np.abs(r - target[test_idx])),
               "+/-",
               np.std(np.abs(r - target[test_idx])))
Example No. 20
def rank_trunc(gram_mat, k, fast=True):
    """
    k-th order approximation of the Gram Matrix G.

    Parameters
    ----------
    gram_mat : array, shape (n_samples, n_samples)
        the Gram matrix
    k : int
        the order approximation
    fast : bool
        use svd (if False) or svds (if True).

    Return
    ------
    gram_mat_k : array, shape (n_samples, n_samples)
        The rank k Gram matrix.
    """
    if fast:
        u, s, v = svds(gram_mat, k)
    else:
        U, S, V = svd(gram_mat)  # full by default --> both U, V are [n x n] here
        # keep the k leading singular triplets: all rows of U, first k columns;
        # first k rows of V, all columns
        s = S[:k]
        u = U[:, :k]
        v = V[:k, :]
    gram_mat_k = (u.dot(np.diag(s))).dot(v)
    return gram_mat_k, u, s
Example No. 21
def mySVD(train, test, k):
    user_ratings_mean = np.mean(train, axis = 1)
    R_demeaned = train - user_ratings_mean.reshape(-1, 1)
    U, sigma, Vt = svds(R_demeaned, k=k)
    sigma = np.diag(sigma)
    all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
    return get_mae(all_user_predicted_ratings, train), get_mae(all_user_predicted_ratings, test)
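get_mae is an external helper, so here is a self-contained run of the same demeaned-SVD recipe on hypothetical ratings, with the training MAE computed by hand:

import numpy as np
from scipy.sparse.linalg import svds

rng = np.random.RandomState(0)
train = rng.randint(1, 6, size=(50, 40)).astype(float)
mean = train.mean(axis=1).reshape(-1, 1)
U, sigma, Vt = svds(train - mean, k=10)
pred = U.dot(np.diag(sigma)).dot(Vt) + mean
print(np.abs(pred - train).mean())  # training MAE of the rank-10 model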
Example No. 22
    def reducedim_svd(self, factors):
        print("Number of factors is " + str(factors))

        ut, s, vt = svds(self.fullmatrix, factors)

        if numpy.isnan(numpy.min(s)):
            print("Warning: diagonal matrix contains NaNs")
            s[numpy.isnan(s)] = 0
            if numpy.isnan(numpy.min(s)):
                print("Error: diagonal matrix still contains NaNs, exiting")
                exit(1)

        print("Completed svd routine")

        self.reducedmatrix = numpy.dot(ut, numpy.diag(s))
        print("Computed reduced vector space")

        # remove negative numbers - set them to zero
        self.reducedmatrix[self.reducedmatrix < 0] = 0

        for vector in self.vectordict.values():
            vector.array = sparse.csc_matrix(self.reducedmatrix[vector.rowindex])
        print("Stored individual vectors")
Example No. 23
def n_of_modules_markov(W):

    K = Kmatrix(W)
    v = W.sum()
    n, m = K.shape
    r = min(n, m)
    exp_value = n * m / float(v)
    _, svs, _ = linalg.svds(K, r - 2)
    svs = svs[::-1]
    eigs = np.power(svs, 2)
    eigs = eigs[1:]  # remove the unit singular value
    p = np.zeros((r - 1,))

    for j in range(r - 1):
        sigma = eigs.sum()
        p[j] = min(exp_value / sigma, 1.)
        eigs = eigs[1:]  # remove the largest singular value

    pdiff = p[1:] - p[:-1]

    try:
        delta = pdiff.argmax()
    except ValueError:
        delta = -1

    q = 2 + delta  # "2 +" because there is at least one module and Python indexing starts at 0

    return q, pdiff, p, svs
Example No. 24
def reduce_dims(A, k, return_svs=False):
    '''
    Reduces original vectors to 'k' dimensions by using
    Singular Value Decomposition to reduce the modified
    A matrix.

    Parameters
    -----
    A : scipy lil_matrix
        modified counts matrix
    k : int
        number of reduced dimensions
    return_svs : bool, default is False
        when True, returns the array of singular values
        for each reduced dimension

    Returns
    -----
    Aprime : matrix
        A matrix reduced to k dimensions
    s : array
        singular values for each reduced dimension
        only returned if return_svs=True
    '''
    # only the right singular vectors are requested, so u comes back as None
    _, s, vt = linalg.svds(A, k=k, return_singular_vectors='vh')
    Aprime = np.dot(np.diag(s), vt).T
    if return_svs:
        return Aprime, s
    else:
        return Aprime
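A usage sketch on a small random counts matrix (hypothetical data, and assuming the scipy.sparse.linalg import the function expects; note that the rows of Aprime correspond to the columns of A):

import numpy as np
from scipy.sparse import lil_matrix

A = lil_matrix(np.random.rand(40, 12))
Aprime, s = reduce_dims(A, k=3, return_svs=True)
print(Aprime.shape, s.shape)  # (12, 3) (3,)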
Example No. 25
def get_coords( axes = 'gene', 
                rows = None,
                time_val = None,
                spatial_idxs = None,
                ids = None):
    bdnet = nio.getBDTNP()
    gene_matrix = array([v['vals'][:, time_val]
                         for v in bdnet.values()
                         if str(time_val + 1) in v['steps']])
    gene_matrix_keys = [k for k, v in bdnet.items()
                        if str(time_val + 1) in v['steps']]

    if axes == 'gene':
        import scipy.sparse as ssp
        import scipy.sparse.linalg as las
        import scipy.sparse.lil as ll
        adj = ssp.csr_matrix(gene_matrix.T)
        n_c = 3
        U, s, Vh = las.svds(adj, n_c)
        filtered_genes = ll.lil_matrix(U) * ll.lil_matrix(diag(s)) * ll.lil_matrix(Vh)
        xs_gene  = U[ids,0]
        ys_gene  = U[ids,1]
        zs_gene  = U[ids,2]
        
    elif axes == 'space':
        space_space =array([[ [r[idxs]  for idxs in sidxs]
                              for sidxs in spatial_idxs] 
                            for r in rows])
        space_space  = space_space[:, : , time_val]
    
        xs_gene = space_space[ids, 0]
        ys_gene = space_space[ids, 1]
        zs_gene = space_space[ids, 2]
    return xs_gene, ys_gene, zs_gene
Example No. 26
def write_svd(norm_matrix, rank, prefix):
  u,s,vt = svds(norm_matrix, k=rank)
  u = u[:,::-1][:,:rank]
  s = s[::-1][:rank]
  v = vt.T[:,::-1][:,:rank]
  for mat, name in ((u, 'u'), (s, 's'), (v, 'v')):
    np.save("%s_%s.npy" % (prefix, name, ), mat)
Example No. 27
def spectral_partition(W,q,method = 'complete', metric = 'cosine'):

    n,m = W.shape
    K = Kmatrix(W)

    if n == m:
        try:
            e,v = linalg.eigen(K, q)
        except TypeError:
            e,v = linalg.eigs(K, q)

    else:
        try:
            u,e,v = linalg.svds(K, q)
        except AttributeError:
            u,e,v = linalg.svd(K, q)
           
        v = np.concatenate((u, v.T), 0)
                
    max_index = e.argmax()
    v = np.delete(v,max_index,1)
    Obs = np.real(v)
    D = distance.pdist(Obs,metric = metric)
    D = np.multiply(D >= 0, D)
    Z = linkage(D, method = method, metric = metric)
    cluster = fcluster(Z, q, criterion = 'maxclust')
            
    cluster -= 1
    cluster = {'spectral' : cluster}

    return cluster
Example No. 28
def get_svs(documents, k=50):
    '''
    Returns the k singular values of the modified counts matrix.
    These values can be plotted to determine the optimal k value
    for LSI.

    Parameters
    -----
    documents : array of vectors
        each vector is given as the non-zero indices of
        the unreduced vector
        > ex: The vector [0 0 2 0 1] should be input
        >     as [2, 2, 4]
    k : int, default is 50
        number of singular values

    Returns
    -----
    s : array
        array of k singular values
    '''
    # build counts matrix
    A = build_counts(documents)
    # modify using TF-IDF
    A2 = tfidf(A)
    # reduce using SVD
    s = linalg.svds(A2, k=k, return_singular_vectors=False)
    # return singular values
    return s[::-1]
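build_counts and tfidf are external helpers; the spectrum inspection itself can be reproduced on a random sparse matrix standing in for the TF-IDF counts:

import numpy as np
from scipy.sparse import random as sparse_random
from scipy.sparse.linalg import svds

A2 = sparse_random(200, 500, density=0.05, random_state=0, format='csr')
s = svds(A2, k=50, return_singular_vectors=False)
print(s[::-1])  # descending: look for the "elbow" to choose k for LSI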
Example No. 29
    def fit(self, t1, t2):
        assert self.Ais is not None, "!!!! First, distribute rows of A using disRand or disBAM"
        d = self.d  # number of distributed matrices
        m = np.shape(self.A)[1]  # dimension of the row space
        Bis = [None] * d  # outputs of local PCA
        Atis = [None] * d  # rank t1 approximation of each Ai

        # local PCA
        for i in range(d):
            # U, S, Vh = svd(Ais[i])
            # Bis[i] = np.diag(S[:t1]).dot(Vh[:t1,:])
            # Atis[i] = U[:,:t1].dot(Bis[i])
            ni = self.Ais[i].shape[0]
            if t1 < ni: # Target rank t1 is less than Number of Rows
                U, S, Vt = svds(self.Ais[i], k=t1)
                Bis[i] = np.diag(S).dot(Vt)
                Atis[i] = U.dot(Bis[i])
            else: # Number of Rows is less than t1
                U, S, Vt = svd(self.Ais[i])
                Bis[i] = np.diag(S[:ni]).dot(Vt[:ni,:])
                Atis[i] = self.Ais[i]
    
        # global PCA
        K = np.zeros((m,m))
        for i in range(d):
            K += Bis[i].T.dot(Bis[i])
        
        # L,Q = eig(K)
        # C = Q[:,:t2]
        # C = C.real
        L,Q = eigs(K, k=t2)
        self.C = Q.real
        self.Bis = Bis
        self.Atis = Atis
Example No. 30
    def cluster_fps(self):
        clkg = hcluster.linkage(self.dm,method = 'average') 
        coarse_r = hcluster.fcluster(clkg,0.3,criterion = 'distance')
        self.coarse_r = coarse_r

        bcount = np.bincount(coarse_r)
        knum = len(np.nonzero(bcount > 1)[0])

        s = self.density_matrix.shape
        if False and len(s) > 1 and s[0] > 10 and s[1] > 10 and knum < min(s) / 2:
            (u, s, vt) = la.svds(self.sps_matrixs, k=knum)
            self.u = u
            print('============')
        else:
            self.result = self.coarse_r
            return (clkg, clkg)

        # rankA = npla.matrix_rank(self.sps_matrixs)
        # if rankA < 3:
        a = np.matrix(np.diag(s)) * np.matrix(vt)
        pd = dist.pdist(np.array(a.T),'cosine')
        pd[np.abs(pd) < 1e-11] = 0
        lkg = hcluster.linkage(pd,method = 'average')
        self.lkg = lkg

        self.result = hcluster.fcluster(lkg, self.svd_cluster_thr, criterion='distance')

        # self.result = hcluster.fcluster(lkg, 1)
        # self.result = hcluster.fclusterdata(u, 0.7, metric='cosine', criterion='distance', method='average')
        return (lkg, clkg)
Example No. 31
def cli():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--nfo',
        action='store_true',
        help='''Compute or plot the singular-value decomposition of the
                        near-field operator (NFO).''')
    parser.add_argument(
        '--lso',
        action='store_true',
        help='''Compute or plot the singular-value decomposition of the
                        Lippmann-Schwinger operator (LSO).''')
    parser.add_argument(
        '--numVals',
        '-k',
        type=int,
        help='''Specify the number of singular values/vectors to compute.
                        Must be a positive integer between 1 and the order of the square
                        input matrix.''')
    parser.add_argument(
        '--domain',
        '-d',
        type=str,
        choices=['time', 'freq'],
        help='''Specify whether to compute the singular-value decomposition in
                        the time domain or frequency domain. Default is set to frequency domain
                        for faster, more accurate performance.''')
    parser.add_argument(
        '--plot',
        '-p',
        action='store_true',
        help='''Plot the computed singular values and vectors.''')
    parser.add_argument(
        '--format',
        '-f',
        type=str,
        default='pdf',
        choices=['png', 'pdf', 'ps', 'eps', 'svg'],
        help=
        '''Specify the image format of the saved file. Accepted formats are png, pdf,
                        ps, eps, and svg. Default format is set to pdf.''')
    parser.add_argument(
        '--mode',
        type=str,
        choices=['light', 'dark'],
        required=False,
        help='''Specify whether to view plots in light mode for daytime viewing
                        or dark mode for nighttime viewing.
                        Mode must be either \'light\' or \'dark\'.''')
    args = parser.parse_args()

    if args.nfo and not args.lso:
        operatorType = 'near-field operator'
        inputType = 'data'
        try:
            SVD = np.load('NFO_SVD.npz')
            s = SVD['s']
            Uh = SVD['Uh']
            V = SVD['V']
            domain = SVD['domain']

        except FileNotFoundError:
            s, Uh, V, domain = None, None, None, 'freq'

    elif not args.nfo and args.lso:
        operatorType = 'Lippmann-Schwinger operator'
        inputType = 'test functions'
        try:
            SVD = np.load('LSO_SVD.npz')
            s = SVD['s']
            Uh = SVD['Uh']
            V = SVD['V']
            domain = SVD['domain']

        except FileNotFoundError:
            s, Uh, V, domain = None, None, None, 'freq'

    elif args.nfo and args.lso:
        sys.exit(
            textwrap.dedent('''
                UsageError: Please specify only one of the arguments \'--nfo\' or \'--lso\'.
                '''))

    else:
        sys.exit(
            textwrap.dedent('''
                For which operator would you like to compute or plot a singular-value decomposition?
                Enter:
                    
                    vzsvd --nfo
                
                for the near-field operator or
                
                    vzsvd --lso
                    
                for the Lippmann-Schwinger operator.
                '''))

    #==============================================================================
    # if an SVD already exists...
    if any(v is not None for v in
           [s, Uh, V]) and args.numVals is not None and args.plot is True:
        if args.numVals >= 1 and args.numVals == len(s):
            userResponded = False
            print(
                textwrap.dedent('''
                 A singular-value decomposition of the {s} for {n} values/vectors already exists. 
                 What would you like to do?
                 
                 Enter '1' to specify a new number of values/vectors to compute. (Default)
                 Enter '2' to recompute a singular-value decomposition for {n} values/vectors.
                 Enter 'q/quit' to exit.
                 '''.format(s=operatorType, n=args.numVals)))
            while userResponded == False:
                answer = input('Action: ')
                if answer == '' or answer == '1':
                    k = int(
                        input(
                            'Please specify the number of singular values/vectors to compute: '
                        ))
                    if isValid(k):
                        print('Proceeding with numVals = %s...' % (k))
                        userResponded = True
                        computeSVD = True
                        break
                    else:
                        break
                elif answer == '2':
                    k = args.numVals
                    print(
                        'Recomputing SVD of the %s for %s singular values/vectors...'
                        % (operatorType, k))
                    userResponded = True
                    computeSVD = True
                elif answer == 'q' or answer == 'quit':
                    sys.exit('Exiting program.\n')
                else:
                    print(
                        'Invalid response. Please enter \'1\', \'2\', or \'q/quit\'.'
                    )

        elif args.numVals >= 1 and args.numVals != len(s):
            k = args.numVals
            computeSVD = True

        elif args.numVals < 1:
            userResponded = False
            print(
                textwrap.dedent('''
                 ValueError: Argument '-k/--numVals' must be a positive integer 
                 between 1 and the order of the square input matrix. The parameter will
                 be set to the default value of 6.
                 What would you like to do?
                 
                 Enter '1' to specify a value of the parameter. (Default)
                 Enter '2' to proceed with the default value.
                 Enter 'q/quit' to exit the program.
                 '''))
            while userResponded == False:
                answer = input('Action: ')
                if answer == '' or answer == '1':
                    k = int(
                        input(
                            'Please specify the number of singular values/vectors to compute: '
                        ))
                    if isValid(k):
                        print('Proceeding with numVals = %s...' % (k))
                        userResponded = True
                        computeSVD = True
                        break
                    else:
                        break
                elif answer == '2':
                    k = 6
                    print('Proceeding with the default value numVals = %s...' %
                          (k))
                    computeSVD = True
                    userResponded = True
                    break
                elif answer == 'q' or answer == 'quit':
                    sys.exit('Exiting program.\n')
                else:
                    print(
                        'Invalid response. Please enter \'1\', \'2\', or \'q/quit\'.'
                    )

    elif all(v is not None for v in
             [s, Uh, V]) and args.numVals is None and args.plot is True:
        computeSVD = False

    elif all(v is not None for v in
             [s, Uh, V]) and args.numVals is not None and args.plot is False:
        if args.numVals >= 1 and args.numVals == len(s):
            userResponded = False
            print(
                textwrap.dedent('''
                 A singular-value decomposition of the {s} for {n} values/vectors already exists. 
                 What would you like to do?
                 
                 Enter '1' to specify a new number of values/vectors to compute. (Default)
                 Enter '2' to recompute a singular-value decomposition for {n} values/vectors.
                 Enter 'q/quit' to exit.
                 '''.format(s=operatorType, n=args.numVals)))
            while userResponded == False:
                answer = input('Action: ')
                if answer == '' or answer == '1':
                    k = int(
                        input(
                            'Please specify the number of singular values/vectors to compute: '
                        ))
                    if isValid(k):
                        print('Proceeding with numVals = %s...' % (k))
                        userResponded = True
                        computeSVD = True
                        break
                    else:
                        break
                elif answer == '2':
                    k = args.numVals
                    print(
                        'Recomputing SVD of the %s for %s singular values/vectors...'
                        % (operatorType, k))
                    userResponded = True
                    computeSVD = True
                elif answer == 'q' or answer == 'quit':
                    sys.exit('Exiting program.\n')
                else:
                    print(
                        'Invalid response. Please enter \'1\', \'2\', or \'q/quit\'.'
                    )

        elif args.numVals >= 1 and args.numVals != len(s):
            k = args.numVals
            computeSVD = True

        elif args.numVals < 1:
            userResponded = False
            print(
                textwrap.dedent('''
                 ValueError: Argument '-k/--numVals' must be a positive integer 
                 between 1 and the order of the square input matrix. The parameter will
                 be set to the default value of 6.
                 What would you like to do?
                 
                 Enter '1' to specify a value of the parameter. (Default)
                 Enter '2' to proceed with the default value.
                 Enter 'q/quit' to exit the program.
                 '''))
            while userResponded == False:
                answer = input('Action: ')
                if answer == '' or answer == '1':
                    k = int(
                        input(
                            'Please specify the number of singular values/vectors to compute: '
                        ))
                    if isValid(k):
                        print('Proceeding with numVals = %s...' % (k))
                        userResponded = True
                        computeSVD = True
                        break
                    else:
                        break
                elif answer == '2':
                    k = 6
                    print('Proceeding with the default value numVals = %s...' %
                          (k))
                    computeSVD = True
                    userResponded = True
                    break
                elif answer == 'q' or answer == 'quit':
                    sys.exit('Exiting program.\n')
                else:
                    print(
                        'Invalid response. Please enter \'1\', \'2\', or \'q/quit\'.'
                    )

    elif all(v is not None for v in
             [s, Uh, V]) and args.numVals is None and args.plot is False:
        sys.exit(
            textwrap.dedent('''
                No action specified. A singular-value decomposition of the %s
                for %s values/vectors already exists. Please specify at least one of '-k/--numVals'
                or '-p/--plot' arguments with 'vzsvd' command.
                ''' % (operatorType, len(s))))
    #==============================================================================
    # if an SVD does not already exist...
    elif any(v is None for v in
             [s, Uh, V]) and args.numVals is not None and args.plot is True:
        if args.numVals >= 1:
            computeSVD = True
            k = args.numVals

        elif args.numVals < 1:
            userResponded = False
            print(
                textwrap.dedent('''
                 ValueError: Argument '-k/--numVals' must be a positive integer 
                 between 1 and the order of the square input matrix. The parameter will
                 be set to the default value of 6.
                 What would you like to do?
                 
                 Enter '1' to specify a value of the parameter. (Default)
                 Enter '2' to proceed with the default value.
                 Enter 'q/quit' to exit the program.
                 '''))
            while userResponded == False:
                answer = input('Action: ')
                if answer == '' or answer == '1':
                    k = int(
                        input(
                            'Please specify the number of singular values/vectors to compute: '
                        ))
                    if isValid(k):
                        print('Proceeding with numVals = %s...' % (k))
                        userResponded = True
                        computeSVD = True
                        break
                    else:
                        break
                elif answer == '2':
                    k = 6
                    print('Proceeding with the default value numVals = %s...' %
                          (k))
                    computeSVD = True
                    userResponded = True
                    break
                elif answer == 'q' or answer == 'quit':
                    sys.exit('Exiting program.\n')
                else:
                    print(
                        'Invalid response. Please enter \'1\', \'2\', or \'q/quit\'.'
                    )

    elif any(v is None for v in
             [s, Uh, V]) and args.numVals is None and args.plot is True:
        userResponded = False
        print(
            textwrap.dedent('''
             PlotError: A singular-value decomposition of the {s} does not exist. A plot will be
             generated after a singular-value decomposition has been computed.
             
             Enter '1' to specify a number of singular values/vectors to compute. (Default)
             Enter 'q/quit' to exit.
             '''.format(s=operatorType)))
        while userResponded == False:
            answer = input('Action: ')
            if answer == '' or answer == '1':
                k = int(
                    input(
                        'Please specify the number of singular values/vectors to compute: '
                    ))
                if isValid(k):
                    print('Proceeding with numVals = %s...' % (k))
                    userResponded = True
                    computeSVD = True
                    break
                else:
                    break
            elif answer == 'q' or answer == 'quit':
                sys.exit('Exiting program.\n')
            else:
                print('Invalid response. Please enter \'1\', or \'q/quit\'.')

    elif any(v is None for v in
             [s, Uh, V]) and args.numVals is not None and args.plot is False:
        if args.numVals >= 1:
            k = args.numVals
            computeSVD = True

        elif args.numVals < 1:
            userResponded = False
            print(
                textwrap.dedent('''
                 ValueError: Argument '-k/--numVals' must be a positive integer 
                 between 1 and the order of the square input matrix. The parameter will
                 be set to the default value of 6.
                 What would you like to do?
                 
                 Enter '1' to specify a value of the parameter. (Default)
                 Enter '2' to proceed with the default value.
                 Enter 'q/quit' to exit the program.
                 '''))
            while userResponded == False:
                answer = input('Action: ')
                if answer == '' or answer == '1':
                    k = int(
                        input(
                            'Please specify the number of singular values/vectors to compute: '
                        ))
                    if isValid(k):
                        print('Proceeding with numVals = %s...' % (k))
                        userResponded = True
                        computeSVD = True
                        break
                    else:
                        break
                elif answer == '2':
                    k = 6
                    print('Proceeding with the default value numVals = %s...' %
                          (k))
                    computeSVD = True
                    userResponded = True
                    break
                elif answer == 'q' or answer == 'quit':
                    sys.exit('Exiting program.\n')
                else:
                    print(
                        'Invalid response. Please enter \'1\', \'2\', or \'q/quit\'.'
                    )

    elif any(v is None for v in
             [s, Uh, V]) and args.numVals is None and args.plot is False:
        sys.exit(
            textwrap.dedent('''
                Nothing to be done. A singular-value decomposition of the {s} does not exist.
                Please specify at least one of '-k/--numVals' or '-p/--plot'
                arguments with 'vzsvd' command.
                '''.format(s=operatorType)))
    #==============================================================================
    # Read in data files
    datadir = np.load('datadir.npz')
    receiverPoints = np.load(str(datadir['receivers']))
    recordingTimes = np.load(str(datadir['recordingTimes']))
    dt = recordingTimes[1] - recordingTimes[0]

    if Path('window.npz').exists():
        windowDict = np.load('window.npz')

        # Apply the receiver window
        rstart = windowDict['rstart']
        rstop = windowDict['rstop']
        rstep = windowDict['rstep']

        # Apply the time window
        tstart = windowDict['tstart']
        tstop = windowDict['tstop']
        tstep = windowDict['tstep']

        # Convert time window parameters to corresponding array indices
        Tstart = int(round(tstart / dt))
        Tstop = int(round(tstop / dt))

    else:
        rstart = 0
        rstop = receiverPoints.shape[0]
        rstep = 1

        tstart = recordingTimes[0]
        tstop = recordingTimes[-1]

        Tstart = 0
        Tstop = len(recordingTimes)
        tstep = 1

    # Apply the receiver window
    rinterval = np.arange(rstart, rstop, rstep)
    receiverPoints = receiverPoints[rinterval, :]

    # Apply the time window
    tinterval = np.arange(Tstart, Tstop, tstep)
    recordingTimes = recordingTimes[tinterval]

    # Used for getting time and frequency units
    if Path('plotParams.pkl').exists():
        plotParams = pickle.load(open('plotParams.pkl', 'rb'))
    else:
        plotParams = default_params()

    if computeSVD:
        # get time units for printing time windows or time shifts
        tu = plotParams['tu']

        if args.nfo:

            if Path('noisyData.npz').exists():
                userResponded = False
                print(
                    textwrap.dedent('''
                      Detected that band-limited noise has been added to the data array.
                      Would you like to compute an SVD of the noisy data? ([y]/n)
                      
                      Enter 'q/quit' to exit the program.
                      '''))
                while userResponded == False:
                    answer = input('Action: ')
                    if answer == '' or answer == 'y' or answer == 'yes':
                        print(
                            'Proceeding with singular-value decomposition using noisy data...'
                        )
                        # read in the noisy data array
                        X = np.load('noisyData.npz')['noisyData']
                        userResponded = True
                    elif answer == 'n' or answer == 'no':
                        print(
                            'Proceeding with singular-value decomposition using noise-free data...'
                        )
                        # read in the recorded data array
                        X = np.load(str(datadir['recordedData']))
                        userResponded = True
                    elif answer == 'q' or answer == 'quit':
                        sys.exit('Exiting program.\n')
                    else:
                        print(
                            'Invalid response. Please enter \'y/yes\', \'n/no\', or \'q/quit\'.'
                        )

            else:
                # read in the recorded data array
                X = np.load(str(datadir['recordedData']))

            if Path('window.npz').exists():
                print('Detected user-specified window:\n')

                # For display/printing purposes, count receivers with one-based
                # indexing. This amounts to incrementing the rstart parameter by 1
                print('window @ receivers : start =', rstart + 1)
                print('window @ receivers : stop =', rstop)
                print('window @ receivers : step =', rstep, '\n')

                if tu != '':
                    print('window @ time : start = %0.2f %s' % (tstart, tu))
                    print('window @ time : stop = %0.2f %s' % (tstop, tu))
                else:
                    print('window @ time : start =', tstart)
                    print('window @ time : stop =', tstop)
                print('window @ time : step =', tstep, '\n')

                # Apply the source window
                slabel = windowDict['slabel']
                sstart = windowDict['sstart']
                sstop = windowDict['sstop']
                sstep = windowDict['sstep']
                sinterval = np.arange(sstart, sstop, sstep)

                # For display/printing purposes, count recordings/sources with one-based
                # indexing. This amounts to incrementing the sstart parameter by 1
                print('window @ %s : start = %s' % (slabel, sstart + 1))
                print('window @ %s : stop = %s' % (slabel, sstop))
                print('window @ %s : step = %s\n' % (slabel, sstep))

                print('Applying window to data volume...')
                X = X[rinterval, :, :]
                X = X[:, tinterval, :]
                X = X[:, :, sinterval]
                Nr, Nt, Ns = X.shape

                # Apply tapered cosine (Tukey) window to time signals.
                # This ensures the fast fourier transform (FFT) used in
                # the definition of the matrix-vector product below is
                # acting on a function that is continuous at its edges.

                peakFreq = pulseFun.peakFreq
                # Np : Number of samples in the dominant period T = 1 / peakFreq
                Np = int(round(1 / (tstep * dt * peakFreq)))
                # alpha is set to taper over 6 of the dominant period of the
                # pulse function (3 periods from each end of the signal)
                alpha = 6 * Np / Nt
                print('Tapering time signals with Tukey window: %d' %
                      (int(round(alpha * 100))) + '%')
                TukeyWindow = tukey(Nt, alpha)
                X *= TukeyWindow[None, :, None]

            else:
                Nr, Nt, Ns = X.shape

        elif args.lso:

            if Path('samplingGrid.npz').exists():
                samplingGrid = np.load('samplingGrid.npz')
                x = samplingGrid['x']
                y = samplingGrid['y']
                tau = samplingGrid['tau']
                if 'z' in samplingGrid:
                    z = samplingGrid['z']
                else:
                    z = None

            else:
                sys.exit(
                    textwrap.dedent('''
                        A sampling grid needs to be set up before computing a
                        singular-value decomposition of the %s.
                        Enter:
                            
                            vzgrid --help
                            
                        from the command-line for more information on how to set up a
                        sampling grid.
                        ''' % (operatorType)))

            pulse = lambda t: pulseFun.pulse(t)
            velocity = pulseFun.velocity
            peakFreq = pulseFun.peakFreq
            peakTime = pulseFun.peakTime

            if Path('VZTestFuncs.npz').exists():
                print(
                    '\nDetected that free-space test functions have already been computed...'
                )
                print(
                    'Checking consistency with current space-time sampling grid...'
                )
                TFDict = np.load('VZTestFuncs.npz')

                if samplingIsCurrent(TFDict, receiverPoints, recordingTimes,
                                     velocity, tau, x, y, z, peakFreq,
                                     peakTime):
                    X = TFDict['TFarray']
                    sourcePoints = TFDict['samplingPoints']
                    print('Moving forward to SVD...')

                else:
                    print('Recomputing test functions...')
                    # set up the convolution times based on length of recording time interval
                    T = recordingTimes[-1] - recordingTimes[0]
                    convolutionTimes = np.linspace(-T, T,
                                                   2 * len(recordingTimes) - 1)

                    if tau[0] != 0:
                        if tu != '':
                            print(
                                'Recomputing test functions for focusing time %0.2f %s...'
                                % (tau[0], tu))
                        else:
                            print(
                                'Recomputing test functions for focusing time %0.2f...'
                                % (tau[0]))
                        X, sourcePoints = sampleSpace(
                            receiverPoints, convolutionTimes - tau[0],
                            velocity, x, y, z, pulse)
                    else:
                        X, sourcePoints = sampleSpace(receiverPoints,
                                                      convolutionTimes,
                                                      velocity, x, y, z, pulse)

                    if z is None:
                        np.savez('VZTestFuncs.npz',
                                 TFarray=X,
                                 time=recordingTimes,
                                 receivers=receiverPoints,
                                 peakFreq=peakFreq,
                                 peakTime=peakTime,
                                 velocity=velocity,
                                 x=x,
                                 y=y,
                                 tau=tau,
                                 samplingPoints=sourcePoints)
                    else:
                        np.savez('VZTestFuncs.npz',
                                 TFarray=X,
                                 time=recordingTimes,
                                 receivers=receiverPoints,
                                 peakFreq=peakFreq,
                                 peakTime=peakTime,
                                 velocity=velocity,
                                 x=x,
                                 y=y,
                                 z=z,
                                 tau=tau,
                                 samplingPoints=sourcePoints)

            else:
                print(
                    '\nComputing free-space test functions for the current space-time sampling grid...'
                )
                if tau[0] != 0:
                    if tu != '':
                        print(
                            'Computing test functions for focusing time %0.2f %s...'
                            % (tau[0], tu))
                    else:
                        print(
                            'Computing test functions for focusing time %0.2f...'
                            % (tau[0]))
                    X, sourcePoints = sampleSpace(receiverPoints,
                                                  recordingTimes - tau[0],
                                                  velocity, x, y, z, pulse)
                else:
                    X, sourcePoints = sampleSpace(receiverPoints,
                                                  recordingTimes, velocity, x,
                                                  y, z, pulse)

                if z is None:
                    np.savez('VZTestFuncs.npz',
                             TFarray=X,
                             time=recordingTimes,
                             receivers=receiverPoints,
                             peakFreq=peakFreq,
                             peakTime=peakTime,
                             velocity=velocity,
                             x=x,
                             y=y,
                             tau=tau,
                             samplingPoints=sourcePoints)
                else:
                    np.savez('VZTestFuncs.npz',
                             TFarray=X,
                             time=recordingTimes,
                             receivers=receiverPoints,
                             peakFreq=peakFreq,
                             peakTime=peakTime,
                             velocity=velocity,
                             x=x,
                             y=y,
                             z=z,
                             tau=tau,
                             samplingPoints=sourcePoints)

            Nr, Nt, Ns = X.shape

        #==============================================================================
        if args.domain is not None:
            domain = args.domain

        if domain == 'freq':
            # Transform convolutional operator into frequency domain and bandpass for efficient SVD
            print('Transforming %s to the frequency domain...' % (inputType))
            N = nextPow2(2 * Nt)
            X = np.fft.rfft(X, n=N, axis=1)

            if plotParams['fmax'] is None:
                freqs = np.fft.rfftfreq(N, tstep * dt)
                plotParams['fmax'] = np.max(freqs)

            # Apply the frequency window
            fmin = plotParams['fmin']
            fmax = plotParams['fmax']
            fu = plotParams['fu']  # frequency units (e.g., Hz)

            if fu != '':
                print('Applying bandpass filter: [%0.2f %s, %0.2f %s]' %
                      (fmin, fu, fmax, fu))
            else:
                print('Applying bandpass filter: [%0.2f, %0.2f]' %
                      (fmin, fmax))

            df = 1.0 / (N * tstep * dt)
            startIndex = int(round(fmin / df))
            stopIndex = int(round(fmax / df))

            finterval = np.arange(startIndex, stopIndex, 1)
            X = X[:, finterval, :]

        #==============================================================================
        # Compute the k largest singular values (which='LM') of the operator A
        # Singular values are elements of the vector 's'
        # Left singular vectors are columns of 'U'
        # Right singular vectors are columns of 'V'

        A = asConvolutionalOperator(X)

        if k == 1:
            print('Computing SVD of the %s for 1 singular value/vector...' %
                  (operatorType))
        else:
            print('Computing SVD of the %s for %s singular values/vectors...' %
                  (operatorType, k))
        startTime = time.time()
        U, s, Vh = svds(A, k, which='LM')
        endTime = time.time()
        print('Elapsed time:', humanReadable(endTime - startTime), '\n')

        # sort the singular values and corresponding vectors in descending order
        # (i.e., largest to smallest)
        index = s.argsort()[::-1]
        s = s[index]
        Uh = U[:, index].conj().T
        V = Vh[index, :].conj().T
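        # svds does not return the triplets largest-first (the default ARPACK
        # solver yields singular values in ascending order), hence the
        # explicit descending sort above before the results are saved.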

        # Write binary output with numpy
        if args.nfo:
            np.savez('NFO_SVD.npz', s=s, Uh=Uh, V=V, domain=domain)
        elif args.lso:
            np.savez('LSO_SVD.npz', s=s, Uh=Uh, V=V, domain=domain)

    #==============================================================================
    if args.plot and all(v is not None for v in [s, Uh, V]):

        Nr = receiverPoints.shape[0]
        Nt = len(recordingTimes)

        try:
            k
        except NameError:
            k = len(s)

        if args.domain is not None and domain != args.domain:
            if domain == 'freq':
                s1 = 'time'
                s2 = 'frequency'
            else:
                s1 = 'frequency'
                s2 = 'time'
            sys.exit(
                textwrap.dedent('''
                    Error: Attempted to plot the singular-value decomposition in the %s
                    domain, but the decomposition was computed in the %s domain.
                    ''' % (s1, s2)))

        if domain == 'freq':
            # plot singular vectors in frequency domain
            N = nextPow2(2 * Nt)
            freqs = np.fft.rfftfreq(N, tstep * dt)

            if plotParams['fmax'] is None:
                plotParams['fmax'] = np.max(freqs)

            # Apply the frequency window
            fmin = plotParams['fmin']
            fmax = plotParams['fmax']
            df = 1.0 / (N * tstep * dt)

            startIndex = int(round(fmin / df))
            stopIndex = int(round(fmax / df))
            finterval = np.arange(startIndex, stopIndex, 1)
            freqs = freqs[finterval]
            fmax = freqs[-1]

            M = len(freqs)
            Ns = int(V.shape[0] / M)
            U = np.reshape(Uh.conj().T, (Nr, M, k))
            V = np.reshape(V, (Ns, M, k))

        else:  # domain == 'time'
            M = 2 * Nt - 1
            Ns = int(V.shape[0] / M)
            U = np.reshape(Uh.T, (Nr, M, k))
            V = np.reshape(V, (Ns, M, k))
            T = recordingTimes[-1] - recordingTimes[0]
            times = np.linspace(-T, T, M)

        if args.nfo:  # Near-field operator
            try:
                sinterval
            except NameError:
                if Path('window.npz').exists():
                    sstart = windowDict['sstart']
                    sstop = windowDict['sstop']
                    sstep = windowDict['sstep']
                else:
                    sstart = 0
                    sstop = Ns
                    sstep = 1

                sinterval = np.arange(sstart, sstop, sstep)

            if 'sources' in datadir:
                sourcePoints = np.load(str(datadir['sources']))
                sourcePoints = sourcePoints[sinterval, :]
            else:
                sourcePoints = None

        else:
            # if args.lso (Lippmann-Schwinger operator)

            # in the case of the Lippmann-Schwinger operator, 'sourcePoints'
            # correspond to sampling points, which should always exist.
            try:
                sourcePoints
            except NameError:
                if Path('VZTestFuncs.npz').exists():
                    TFDict = np.load('VZTestFuncs.npz')
                    sourcePoints = TFDict['samplingPoints']
                else:
                    sys.exit(
                        textwrap.dedent('''
                            Error: A sampling grid must exist and test functions computed
                            before a singular-value decomposition of the Lippmann-Schwinger
                            operator can be computed or plotted.
                            '''))

            sstart = 0
            sstop = sourcePoints.shape[0]
            sstep = 1
            sinterval = np.arange(sstart, sstop, sstep)

        # increment source/recording interval and receiver interval to be consistent
        # with one-based indexing (i.e., count from one instead of zero)
        sinterval += 1
        rinterval += 1
        rstart += 1
        sstart += 1

        if args.mode is not None:
            plotParams['view_mode'] = args.mode

        with open('plotParams.pkl', 'wb') as f:
            pickle.dump(plotParams, f, pickle.HIGHEST_PROTOCOL)

        remove_keymap_conflicts({'left', 'right', 'up', 'down', 'save'})
        if domain == 'freq':

            # plot the left singular vectors
            fig_lvec, ax_lvec_r, ax_lvec_i = setFigure(
                num_axes=2, mode=plotParams['view_mode'])
            ax_lvec_r.volume = U.real
            ax_lvec_i.volume = U.imag
            ax_lvec_r.index = 0
            ax_lvec_i.index = 0
            fig_lvec.suptitle('Left-Singular Vector',
                              color=ax_lvec_r.titlecolor,
                              fontsize=16)
            fig_lvec.subplots_adjust(bottom=0.27, top=0.86)
            leftTitle_r = vector_title('left', ax_lvec_r.index + 1, 'real')
            leftTitle_i = vector_title('left', ax_lvec_i.index + 1, 'imag')
            for ax, title in zip([ax_lvec_r, ax_lvec_i],
                                 [leftTitle_r, leftTitle_i]):
                left_im = plotFreqVectors(ax, ax.volume[:, :, ax.index], freqs,
                                          fmin, fmax, rstart, rinterval,
                                          receiverPoints, title, 'left',
                                          plotParams)

            lp0 = ax_lvec_r.get_position().get_points().flatten()
            lp1 = ax_lvec_i.get_position().get_points().flatten()
            left_cax = fig_lvec.add_axes([lp0[0], 0.12, lp1[2] - lp0[0], 0.03])
            lcbar = fig_lvec.colorbar(left_im,
                                      left_cax,
                                      orientation='horizontal')
            lcbar.outline.set_edgecolor(ax_lvec_r.cbaredgecolor)
            lcbar.ax.tick_params(axis='x', colors=ax_lvec_r.labelcolor)
            lcbar.ax.yaxis.set_major_formatter(FormatStrFormatter('%.1f'))
            lcbar.set_label('Amplitude',
                            labelpad=5,
                            rotation=0,
                            fontsize=12,
                            color=ax_lvec_r.labelcolor)
            fig_lvec.canvas.mpl_connect(
                'key_press_event', lambda event: process_key_vectors(
                    event, freqs, fmin, fmax, rstart, sstart, rinterval,
                    sinterval, receiverPoints, sourcePoints, plotParams,
                    'cmplx_left'))

            # plot the right singular vectors
            fig_rvec, ax_rvec_r, ax_rvec_i = setFigure(
                num_axes=2, mode=plotParams['view_mode'])
            ax_rvec_r.volume = V.real
            ax_rvec_i.volume = V.imag
            ax_rvec_r.index = 0
            ax_rvec_i.index = 0
            fig_rvec.suptitle('Right-Singular Vector',
                              color=ax_rvec_r.titlecolor,
                              fontsize=16)
            fig_rvec.subplots_adjust(bottom=0.27, top=0.86)
            rightTitle_r = vector_title('right', ax_rvec_r.index + 1, 'real')
            rightTitle_i = vector_title('right', ax_rvec_i.index + 1, 'imag')
            for ax, title in zip([ax_rvec_r, ax_rvec_i],
                                 [rightTitle_r, rightTitle_i]):
                right_im = plotFreqVectors(ax, ax.volume[:, :, ax.index],
                                           freqs, fmin, fmax, sstart,
                                           sinterval, sourcePoints, title,
                                           'right', plotParams)

            rp0 = ax_rvec_r.get_position().get_points().flatten()
            rp1 = ax_rvec_i.get_position().get_points().flatten()
            right_cax = fig_rvec.add_axes(
                [rp0[0], 0.12, rp1[2] - rp0[0], 0.03])
            rcbar = fig_rvec.colorbar(right_im,
                                      right_cax,
                                      orientation='horizontal')
            rcbar.outline.set_edgecolor(ax_rvec_r.cbaredgecolor)
            rcbar.ax.tick_params(axis='x', colors=ax_rvec_r.labelcolor)
            rcbar.ax.yaxis.set_major_formatter(FormatStrFormatter('%.1f'))
            rcbar.set_label('Amplitude',
                            labelpad=5,
                            rotation=0,
                            fontsize=12,
                            color=ax_rvec_r.labelcolor)
            fig_rvec.canvas.mpl_connect(
                'key_press_event', lambda event: process_key_vectors(
                    event, freqs, fmin, fmax, rstart, sstart, rinterval,
                    sinterval, receiverPoints, sourcePoints, plotParams,
                    'cmplx_right'))

        else:
            # domain == 'time'
            fig_vec, ax_lvec, ax_rvec = setFigure(num_axes=2,
                                                  mode=plotParams['view_mode'])

            ax_lvec.volume = U
            ax_lvec.index = 0
            leftTitle = vector_title('left', ax_lvec.index + 1)
            plotWiggles(ax_lvec, ax_lvec.volume[:, :, ax_lvec.index], times,
                        -T, T, rstart, rinterval, receiverPoints, leftTitle,
                        'left', plotParams)

            ax_rvec.volume = V
            ax_rvec.index = 0
            rightTitle = vector_title('right', ax_rvec.index + 1)
            plotWiggles(ax_rvec, ax_rvec.volume[:, :, ax_rvec.index], times,
                        -T, T, sstart, sinterval, sourcePoints, rightTitle,
                        'right', plotParams)
            fig_vec.tight_layout()
            fig_vec.canvas.mpl_connect(
                'key_press_event', lambda event: process_key_vectors(
                    event, times, -T, T, rstart, sstart, rinterval, sinterval,
                    receiverPoints, sourcePoints, plotParams))
        #==============================================================================
        # plot the singular values
        # figure and axis for singular values
        fig_vals, ax_vals = setFigure(num_axes=1, mode=plotParams['view_mode'])

        n = np.arange(1, k + 1, 1)
        kappa = s[0] / s[-1]  # condition number = max(s) / min(s)
        ax_vals.plot(n,
                     s,
                     '.',
                     clip_on=False,
                     markersize=9,
                     label=r'Condition Number: %0.1e' % (kappa),
                     color=ax_vals.pointcolor)
        ax_vals.set_xlabel('n', color=ax_vals.labelcolor)
        ax_vals.set_ylabel(r'$\sigma_n$', color=ax_vals.labelcolor)
        legend = ax_vals.legend(title='Singular Values',
                                loc='upper center',
                                bbox_to_anchor=(0.5, 1.25),
                                markerscale=0,
                                handlelength=0,
                                handletextpad=0,
                                fancybox=True,
                                shadow=True,
                                fontsize='large')
        legend.get_title().set_fontsize('large')
        ax_vals.set_xlim([1, k])
        ax_vals.set_ylim(bottom=0)
        ax_vals.locator_params(axis='y', nticks=6)
        ax_vals.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
        fig_vals.tight_layout()
        fig_vals.savefig('singularValues.' + args.format,
                         format=args.format,
                         bbox_inches='tight',
                         facecolor=fig_vals.get_facecolor())

        plt.show()
Exemplo n.º 32
0
def hysime(y, n, Rn, verbose=True):
    """HySime: estimate the dimension of the signal subspace of
    hyperspectral data.

    y  -- L x N data set (L bands, N pixels)
    n  -- L x N noise estimate
    Rn -- L x L noise correlation matrix
    """
    L, N = y.shape
    if not np.prod(y.shape):
        raise ValueError("the data set is empty")
    Ln, Nn = n.shape
    d1, d2 = Rn.shape

    if Ln != L or Nn != N:  # n is an empty matrix or has a different size
        raise ValueError("empty noise matrix or its size does "
                         "not agree with size of y")
    if d1 != d2 or d1 != L:
        print("Bad noise correlation matrix")
        Rn = n @ n.conj().T / N

    x = y - n  # signal estimate

    if verbose:
        print("Computing the correlation matrices")
    Ry = y @ y.conj().T / N  # sample correlation matrix
    Rx = x @ x.conj().T / N  # signal correlation matrix estimate
    if verbose:
        print("Computing the eigenvectors of the signal correlation matrix")
    # eigenvectors/eigenvalues of Rx in decreasing order, equation (15);
    # svds needs k < L and does not guarantee an order, so sort explicitly
    E, D, _ = svds(Rx, k=L - 1)
    order = np.argsort(D)[::-1]
    E, dx = E[:, order], D[order]

    if verbose:
        print("Estimating the number of endmembers")
    Rn = Rn + np.sum(np.diag(Rx)) / L / 10**10 * np.identity(L)
    Py = np.diag(E.conj().T @ Ry @ E)  # equation (23)
    Pn = np.diag(E.conj().T @ Rn @ E)  # equation (24)
    cost_F = -Py + 2 * Pn  # equation (22)

    kf = int(np.sum(cost_F < 0))
    ind_asc = np.argsort(cost_F)
    Ek = E[:, ind_asc[:kf]]
    if verbose:
        print("The signal subspace dimension is: k = %d" % kf)

    # only for plot purposes, equation (19)
    Py_sort = np.trace(Ry) - np.cumsum(Py[ind_asc])
    Pn_sort = 2 * np.cumsum(Pn[ind_asc])
    cost_F_sort = Py_sort + Pn_sort

    indices = np.arange(1, len(cost_F_sort) + 1)
    plt.figure()
    plt.semilogy(indices, cost_F_sort, label='Mean Squared Error')
    plt.semilogy(indices, Py_sort, label='Projection Error')
    plt.semilogy(indices, Pn_sort, label='Noise Power')
    plt.xlabel("k")
    plt.ylabel("mse(k)")
    plt.title('HySime')
    plt.legend()
    plt.show()

    return kf, Ek
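
# A minimal usage sketch (assumed, not part of the original snippet): run the
# fixed hysime above on synthetic data whose signal lives in a known
# 5-dimensional subspace and check the estimated dimension.
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    nbands, npix, p = 50, 1000, 5
    mix = rng.random((nbands, p))             # mixing matrix (p endmembers)
    abund = rng.random((p, npix))             # abundance fractions
    noise = 0.01 * rng.standard_normal((nbands, npix))
    y = mix @ abund + noise                   # observed data
    Rn = noise @ noise.T / npix               # noise correlation matrix
    kf, Ek = hysime(y, noise, Rn, verbose=False)
    print('estimated subspace dimension:', kf)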
Exemplo n.º 33
0
def make_assoc_dict(deps, minct=100, svd=False, outpath=None):
    #def make_tc_dict(deps, minct=2, laplace=1.0, positive=True, outpath=None):
    #def make_tc_dict(deps, mostcommon=0.8, laplace=1.0, positive=True, outpath=None):
    """PMI. Laplace smooothing not currently implemented.
    """
    ctr = defaultdict(int)  # for keeping joint counts
    wordctr = defaultdict(int)
    ctxctr = defaultdict(int)
    print('Getting counts')
    for triple in tqdm(deps):
        ctr[tuple([triple[0] + '-' + triple[1], triple[2]])] += 1
        ctxctr[triple[0] + '-' + triple[1]] += 1
        wordctr[triple[1]] += 1
        wordctr[triple[2]] += 1

    # Enforcing min. ct.
    ctr = {
        k: v
        for k, v in ctr.items()
        if all(x > minct for x in [wordctr[k[1]], ctxctr[k[0]]])
    }
    wordctr = {k: v for k, v in wordctr.items() if wordctr[k] > minct}
    ctxctr = {k: v for k, v in ctxctr.items() if ctxctr[k] > minct}
    total = sum(v for v in ctr.values())

    print('\n# total triples: {}'.format(total))
    print('# words: {}'.format(len(wordctr)))
    print('# rel-gov pairs: {}\n'.format(len(ctxctr)))

    print('Converting to PMI')
    for k in tqdm(ctr.keys()):
        ctr[k] /= wordctr[k[1]]
        ctr[k] /= ctxctr[k[0]]
        ctr[k] *= total
        ctr[k] = np.log2(ctr[k])
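    # Each entry now holds the pointwise mutual information
    #   PMI(ctx, w) = log2(count(ctx, w) * total / (ctxctr[ctx] * wordctr[w])),
    # i.e. log2 of the joint probability over the product of the marginals.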

    print('Converting to pandas')
    ctrpd = pd.Series(ctr).reset_index()
    ctrpd.columns = ['Rel-Gov', 'Dep', 'PMI']
    ctrpd['PMI'] = pd.arrays.SparseArray(ctrpd['PMI'])  # Converting to sparse
    if outpath and not svd:
        print('Writing to compressed .csv')
        ctrpd.to_csv(os.path.join(outpath,
                                  'specPMI_' + str(minct) + 'cutoff.csv.gz'),
                     compression='gzip',
                     na_rep=np.nan)
    elif outpath and svd:
        print('Performing SVD w/ {} dimensions'.format(svd))
        print('NOTE: Converting to (sparse) positive PMI for consistency!')
        ctrpd['PMI'].clip(lower=0.0, inplace=True)  # Making it PPMI in place
        #ctrpd = ctrpd.astype(pd.SparseDtype(np.float32, fill_value=0.0))
        ctrpd = ctrpd.pivot(
            index='Dep',
            columns='Rel-Gov',
            values='PMI',
        ).fillna(0.0)
        print('Density: {}'.format(ctrpd.sparse.density))
        wvecs, singvals, cvecs = svds(ctrpd.sparse.to_coo(), k=svd)
        print('Making word and context symmetric (Levy et al. 2015)')
        wsym = wvecs * np.sqrt(singvals)
        # Transposing so context vectors are in the rows
        csym = cvecs.T * np.sqrt(singvals)
        wsym = pd.DataFrame(wsym, index=ctrpd.index)
        csym = pd.DataFrame(csym, index=ctrpd.columns)
        print('Writing to compressed .csv')
        wsym.to_csv(os.path.join(
            outpath, 'WordVecs' + str(minct) + 'SVD' + str(svd) + '.csv.gz'),
                    compression='gzip',
                    na_rep=np.nan)
        csym.to_csv(os.path.join(
            outpath,
            'ContextVecs' + str(minct) + 'SVD' + str(svd) + '.csv.gz'),
                    compression='gzip',
                    na_rep=np.nan)
    return ctrpd
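
# A minimal usage sketch (assumed, not part of the original snippet): feed
# (relation, governor, dependent) triples and inspect the PMI table; minct=0
# keeps every count for this toy input.
if __name__ == '__main__':
    toy_deps = [('nsubj', 'run', 'dog'), ('nsubj', 'run', 'cat'),
                ('dobj', 'chase', 'cat')]
    pmi_table = make_assoc_dict(toy_deps, minct=0, svd=False)
    print(pmi_table)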
Exemplo n.º 34
0
 def test_svds_input_validation_A(self, args):
     A, error_type, message = args
     with pytest.raises(error_type, match=message):
         svds(A, k=1, solver=self.solver)
Exemplo n.º 35
0
 def test_svds_input_validation_tol_2(self, tol):
     # I think the stack trace is reasonable here
     message = "'<' not supported between instances"
     with pytest.raises(TypeError, match=message):
         svds(np.eye(10), tol=tol, solver=self.solver)
Exemplo n.º 36
0
    def test_svd_linop(self):
        solver = self.solver

        nmks = [(6, 7, 3), (9, 5, 4), (10, 8, 5)]

        def reorder(args):
            U, s, VH = args
            j = np.argsort(s)
            return U[:, j], s[j], VH[j, :]

        for n, m, k in nmks:
            # Test svds on a LinearOperator.
            A = np.random.RandomState(52).randn(n, m)
            L = CheckingLinearOperator(A)

            if solver == 'propack':
                v0 = np.ones(n)
            else:
                v0 = np.ones(min(A.shape))

            U1, s1, VH1 = reorder(svds(A, k, v0=v0, solver=solver))
            U2, s2, VH2 = reorder(svds(L, k, v0=v0, solver=solver))

            assert_allclose(np.abs(U1), np.abs(U2))
            assert_allclose(s1, s2)
            assert_allclose(np.abs(VH1), np.abs(VH2))
            assert_allclose(np.dot(U1, np.dot(np.diag(s1), VH1)),
                            np.dot(U2, np.dot(np.diag(s2), VH2)))

            # Try again with which="SM".
            A = np.random.RandomState(1909).randn(n, m)
            L = CheckingLinearOperator(A)

            # TODO: arpack crashes when v0=v0, which="SM"
            kwargs = {'v0': v0} if solver not in {None, 'arpack'} else {}
            U1, s1, VH1 = reorder(
                svds(A, k, which="SM", solver=solver, **kwargs))
            U2, s2, VH2 = reorder(
                svds(L, k, which="SM", solver=solver, **kwargs))

            assert_allclose(np.abs(U1), np.abs(U2))
            assert_allclose(s1, s2)
            assert_allclose(np.abs(VH1), np.abs(VH2))
            assert_allclose(np.dot(U1, np.dot(np.diag(s1), VH1)),
                            np.dot(U2, np.dot(np.diag(s2), VH2)))

            if k < min(n, m) - 1:
                # Complex input and explicit which="LM".
                for (dt, eps) in [(complex, 1e-7), (np.complex64, 1e-3)]:
                    rng = np.random.RandomState(1648)
                    A = (rng.randn(n, m) + 1j * rng.randn(n, m)).astype(dt)
                    L = CheckingLinearOperator(A)

                    U1, s1, VH1 = reorder(svds(A, k, which="LM",
                                               solver=solver))
                    U2, s2, VH2 = reorder(svds(L, k, which="LM",
                                               solver=solver))

                    assert_allclose(np.abs(U1), np.abs(U2), rtol=eps)
                    assert_allclose(s1, s2, rtol=eps)
                    assert_allclose(np.abs(VH1), np.abs(VH2), rtol=eps)
                    assert_allclose(np.dot(U1, np.dot(np.diag(s1), VH1)),
                                    np.dot(U2, np.dot(np.diag(s2), VH2)),
                                    rtol=eps)
Exemplo n.º 37
0
 def test_svds_input_validation_return_singular_vectors(self, rsv):
     message = "`return_singular_vectors` must be in"
     with pytest.raises(ValueError, match=message):
         svds(np.eye(10), return_singular_vectors=rsv, solver=self.solver)
Exemplo n.º 38
0
 def test_svds_input_validation_maxiter_1(self, maxiter):
     message = ("`maxiter` must be a positive integer.")
     with pytest.raises(ValueError, match=message):
         svds(np.eye(10), maxiter=maxiter, solver=self.solver)
Exemplo n.º 39
0
 def test_svds_input_validation_v0_3(self, v0):
     A = np.ones((10, 10))
     message = "`v0` must be of floating or complex floating data type."
     with pytest.raises(ValueError, match=message):
         svds(A, k=1, v0=v0, solver=self.solver)
Exemplo n.º 40
0
 def test_svds_input_validation_v0_2(self):
     A = np.ones((10, 10))
     v0 = np.ones((1, 10))
     message = "`v0` must have shape"
     with pytest.raises(ValueError, match=message):
         svds(A, k=1, v0=v0, solver=self.solver)
Exemplo n.º 41
0
 def test_svds_input_validation_solver(self, solver):
     message = "solver must be one of"
     with pytest.raises(ValueError, match=message):
         svds(np.ones((3, 4)), k=2, solver=solver)
Exemplo n.º 42
0
 def test_svds_input_validation_tol_1(self, tol):
     message = "`tol` must be a non-negative floating point value."
     with pytest.raises(ValueError, match=message):
         svds(np.eye(10), tol=tol, solver=self.solver)
Exemplo n.º 43
0
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2)
                    for k in range(K):
                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
        if e < 0.001:
            break
    return P, Q.T

nets_train, nets_test = train_test_split(nets, test_size=0.3)
print(nets_train.shape)
print(nets_test.shape)
nets_mean = np.mean(nets_train, axis = 1)
nets_demeaned = nets_train - nets_mean.reshape(-1, 1)
#  print(R_demeaned)
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(nets_demeaned, k = 25)
sigma = np.diag(sigma)
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + nets_mean.reshape(-1, 1)
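# The truncated SVD approximates the de-meaned ratings, so each row's mean is
# added back to return the predictions to the original rating scale.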

print(all_user_predicted_ratings.shape)
print(nets.shape)

from sklearn.metrics import mean_squared_error
from math import sqrt  # sqrt is used below to report RMSE

def get_mse(pred, actual):
    # evaluate only on observed (nonzero) entries
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

print(sqrt(get_mse(all_user_predicted_ratings,nets_test)))
Exemplo n.º 44
0
# Populate our empty matrix with the cumulative scores between teams
for index, row in df_scores.iterrows():
    
    # get id
    home_id = row['Home_Team_id']
    away_id = row['Away_Team_id']
    
    # convert id to a number in range 0-29
    home_num = df_team_ids.loc[df_team_ids['TEAM_ID'] == home_id]['SV_TEAM_ID']
    away_num = df_team_ids.loc[df_team_ids['TEAM_ID'] == away_id]['SV_TEAM_ID']
    home_num = home_num.iloc[0]
    away_num = away_num.iloc[0]
    
    # add scores to matrix
    df_scores_mat[home_num][away_num] += row['Home_PTS']
    df_scores_mat[away_num][home_num] += row['Visitor_PTS']

scores_mat = df_scores_mat.to_numpy()

# apply SVD
from scipy.sparse.linalg import svds
U, s, V = svds(scores_mat,k=2)

# plot latent vectors
plt.scatter(U[:,0], U[:,1], c=np.arange(0,30))
plt.show()

# scale by the square roots of the singular values and plot again
U2 = np.dot(U,np.sqrt(np.diag(s)))
plt.scatter(U2[:,0], U2[:,1], c=np.arange(0,30))
plt.show()
Exemplo n.º 45
0
#Creating a sparse pivot table with users in rows and items in columns
users_items_pivot_matrix_df = interactions_train_df.pivot(
    index='personId', columns='contentId', values='eventStrength').fillna(0)

users_items_pivot_matrix_df.head(10)
users_items_pivot_matrix = users_items_pivot_matrix_df.to_numpy()
users_items_pivot_matrix[:10]
users_ids = list(users_items_pivot_matrix_df.index)
users_ids[:10]
users_items_pivot_sparse_matrix = csr_matrix(users_items_pivot_matrix)
users_items_pivot_sparse_matrix
#The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 15
#Performs matrix factorization of the original user item matrix
#U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)
U, sigma, Vt = svds(users_items_pivot_sparse_matrix, k=NUMBER_OF_FACTORS_MF)
U.shape
Vt.shape
sigma = np.diag(sigma)
sigma.shape
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt)
all_user_predicted_ratings
all_user_predicted_ratings_norm = (
    all_user_predicted_ratings - all_user_predicted_ratings.min()) / (
        all_user_predicted_ratings.max() - all_user_predicted_ratings.min())
#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(all_user_predicted_ratings_norm,
                           columns=users_items_pivot_matrix_df.columns,
                           index=users_ids).transpose()
cf_preds_df.head(10)
len(cf_preds_df.columns)
Exemplo n.º 46
0
def low_rank_svd(matrix, singular_count=2):
    u, s, vt = svds(matrix, k=singular_count)
    return u, s, vt
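
# A minimal usage sketch (assumed, not part of the original snippet): take a
# rank-2 approximation of a random sparse matrix and rebuild it.
import numpy as np
from scipy.sparse import random as sparse_random

M = sparse_random(20, 10, density=0.3, random_state=0)
u, s, vt = low_rank_svd(M, singular_count=2)
M_rank2 = u @ np.diag(s) @ vt  # best rank-2 approximation in Frobenius norm
print(M_rank2.shape)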
Exemplo n.º 47
0
    return pred


item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

from sklearn.metrics import mean_squared_error
from math import sqrt


def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))


print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

sparsity = round(1.0 - len(df) / float(n_users * n_items), 3)
print('The sparsity level of MovieLens100K is ' + str(sparsity * 100) + '%')

import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k=10)
s_diag_matrix = np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print('User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix)))
Exemplo n.º 48
0
def training():

    db = firestore.client()
    doc_ref = db.collection(u'ratings').where(u'book', u'==', True).stream()

    ref = db.collection_group(u'review')\
        .where(u'reviewed', u'==', True).order_by('user_id')
    docs = ref.stream()
    data1 = []  # collected review documents
    for doc in docs:
        data1.append(doc.to_dict())
    # print(type(data), data)
    df = pd.DataFrame(data1)
    df1 = df[['user_id', 'book_id', 'rating']]
    # print(df1)

    ratingss = pd.read_csv('dataset/ratings.csv',
                           usecols=['user_id', 'book_id', 'rating'])
    # print(ratings.head(5))
    ratings = pd.concat([ratingss, df1])
    # print(new)
    ratings['user_id'] = ratings['user_id'].apply(str)

    n_users = ratings.user_id.unique().shape[0]
    n_books = ratings.book_id.unique().shape[0]
    print('Number of users = ' + str(n_users) + ' | Number of books = ' +
          str(n_books))

    Ratings = ratings.pivot(index='user_id',
                            columns='book_id',
                            values='rating').fillna(0)
    # Ratings.head()

    R = Ratings.to_numpy()
    user_ratings_mean = np.mean(R, axis=1)
    Ratings_demeaned = R - user_ratings_mean.reshape(-1, 1)

    sparsity = round(1.0 - len(ratings) / float(n_users * n_books), 3)
    # print ('The sparsity level of MovieLens1M dataset is ' +  str(sparsity * 100) + '%')

    U, sigma, Vt = svds(Ratings_demeaned, k=50)

    sigma = np.diag(sigma)
    # sigma

    all_user_predicted_ratings = np.dot(np.dot(U, sigma),
                                        Vt) + user_ratings_mean.reshape(-1, 1)

    preds = pd.DataFrame(all_user_predicted_ratings, columns=Ratings.columns)

    reader = Reader()

    # Load ratings dataset with Dataset library
    data = Dataset.load_from_df(ratings[['user_id', 'book_id', 'rating']],
                                reader)

    # Split the dataset for 5-fold evaluation
    kf = KFold(n_splits=5)
    svd = SVD()

    for trainset, testset in kf.split(data):

        # train and test algorithm.
        svd.fit(trainset)
        predictions = svd.test(testset)

        # Compute and print Root Mean Squared Error
        accuracy.rmse(predictions, verbose=True)

    trainset = data.build_full_trainset()
    svd.fit(trainset)
    print('Data Trained Successfully')
    with open('model_pickle', 'wb') as f:
        pickle.dump(svd, f)
Exemplo n.º 49
0
for i in range(0, rating_train_matrix.shape[0]):
    for j in range(0, rating_train_matrix.shape[1]):
        if rating_train_matrix[i][j] == 0:
            mean_u_ratings[i][j] = 0

print(mean_u_ratings)

# In[11]:

### reference: https://github.com/khanhnamle1994/movielens/blob/master/SVD_Model.ipynb

## use scipy function to do the singular value decomposition

from scipy.sparse.linalg import svds
U, sigma, Vt = svds(mean_u_ratings, k=200)

sigma = np.diag(sigma)
print(sigma)

# In[12]:

### Reference:  https://simplyml.com/generating-recommendations/

## predict the ratings by multiplying the three matrices U, sigma and Vt

predict_rating = np.dot(np.dot(U, sigma), Vt) + mean_user_ratings.reshape(
    -1, 1)
print(predict_rating)
print(predict_rating.shape)
Exemplo n.º 50
0
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

    #prediction = prediction[ground_truth.nonzero()].flatten()


print('similarity-based CF RMSE: ' + str(rmse(user_prediction, R)))

# normalize the values and adjust the columns and indexes

R = R_df.to_numpy()
user_ratings_mean = np.mean(R, axis=1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R_demeaned, k=50)

sigma = np.diag(sigma)

all_user_predicted_ratings = np.dot(np.dot(U, sigma),
                                    Vt) + user_ratings_mean.reshape(-1, 1)
preds_df = pd.DataFrame(all_user_predicted_ratings, columns=R_df.columns)

from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R_demeaned, k=50)

normalized = (preds_df - preds_df.min().min()) / (preds_df.max().max() -
                                                  preds_df.min().min())

from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 5))
Exemplo n.º 51
0
csvt = pd.read_csv('go_track_trackspoints.csv')

tracks = csvt[['track_id', 'latitude', 'longitude']]

# load the csv generated by update.py
R_df = pd.read_csv('matrix.csv', header=0, index_col=0)
# convert the pandas dataframe to a numpy matrix for the computations and normalization
R = R_df.to_numpy().astype(np.int64)

user_ratings_mean = np.mean(R, axis=1)

R_demeaned = R - user_ratings_mean.reshape(-1, 1)

from scipy.sparse.linalg import svds

U, sigma, Vt = svds(R_demeaned, k=25)

sigma = np.diag(sigma)

all_user_predicted_ratings = np.dot(np.dot(U, sigma),
                                    Vt) + user_ratings_mean.reshape(-1, 1)
preds_df = pd.DataFrame(all_user_predicted_ratings,
                        columns=R_df.columns.astype(np.int64))


def recommend_tracks(predictions_df,
                     userID,
                     tracks,
                     original_ratings_df,
                     num_recommendations=5):
Exemplo n.º 52
0
def report():
    import numpy as np  # linear algebra
    import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
    from subprocess import check_output

    import os
    print(os.listdir('../input'))
    diff = pd.read_csv('../input/diffsydiw.csv')
    sym = pd.read_csv('../input/sym_t.csv')
    dia = pd.read_csv('../input/dia_t.csv')
    # print(sym.head())
    # dia['idnr'] = dia['_id'].convert_objects(convert_numeric=True)
    # print(dia.head())
    sd_diff = diff.merge(sym, left_on='syd', right_on='syd')
    #print(sd_diff.head())
    sd_diff = sd_diff.merge(dia, left_on='did', right_on='did')
    # print(sd_diff.head())

    from sklearn.preprocessing import LabelEncoder
    from tqdm import tqdm

    from scipy.sparse import coo_matrix, csr_matrix

    def read_data(filename):
        """ Reads in the last.fm dataset, and returns a tuple of a pandas dataframe
        and a sparse matrix of song/user/playcount """
        # read in triples of user/song/playcount from the input dataset
        data = pd.read_csv(
            filename,
            usecols=[
                0, 1, 2
            ],  # [36, 11, 10] vrk_pat_primkey,prd_atc_primkey,vdp_aantal
            names=['user', 'song', 'plays'],
            skiprows=1
        )  # [:1000000]   # user = patient, or prescriptionnr song=atc

        data = data.dropna(axis=0, how='any')  # drop nan
        data['plays'] = data['plays'] + 1
        # print(data.head())
        # map each song and user to a unique numeric value
        data['user'] = data['user'].astype("category")
        data['song'] = data['song'].astype("category")

        # create a sparse matrix of all the users/plays
        plays = coo_matrix(
            (data['plays'].astype(float), (data['song'].cat.codes.copy(),
                                           data['user'].cat.codes.copy())))

        return data, plays, data.groupby(
            ['song']).plays.sum(), data['user'].cat.codes.copy()

    data, matrix, songsd, user = read_data('../input/diffsydiw.csv')
    data.head(10)

    # user=symptom
    # sond=diagnose

    from sklearn.preprocessing import normalize

    def cosine(plays):
        normalized = normalize(plays)
        return normalized.dot(normalized.T)

    def bhattacharya(plays):
        plays.data = np.sqrt(plays.data)
        return cosine(plays)

    def ochiai(plays):
        plays = csr_matrix(plays)
        plays.data = np.ones(len(plays.data))
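        # with binary (0/1) data, cosine similarity equals the Ochiai
        # coefficient between the original occurrence sets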
        return cosine(plays)

    def bm25_weight(data, K1=1.2, B=0.8):
        """ Weighs each row of the matrix data by BM25 weighting """
        # calculate idf per term (user)
        N = float(data.shape[0])
        idf = np.log(N / (1 + np.bincount(data.col)))

        # calculate length_norm per document (artist)
        row_sums = np.squeeze(np.asarray(data.sum(1)))
        average_length = row_sums.sum() / N
        length_norm = (1.0 - B) + B * row_sums / average_length

        # weight matrix rows by bm25
        ret = coo_matrix(data)
        ret.data = ret.data * (K1 + 1.0) / (K1 * length_norm[ret.row] +
                                            ret.data) * idf[ret.col]
        return ret

    def bm25(plays):
        plays = bm25_weight(plays)
        return plays.dot(plays.T)

    def get_largest(row, N=10):
        if N >= row.nnz:
            best = zip(row.data, row.indices)
        else:
            ind = np.argpartition(row.data, -N)[-N:]
            best = zip(row.data[ind], row.indices[ind])
        return sorted(best, reverse=True)

    def calculate_similar_artists(similarity, artists, artistid):
        neighbours = similarity[artistid]
        top = get_largest(neighbours)
        return [(artists[other], score, i)
                for i, (score, other) in enumerate(top)]

    # songsd = dict(enumerate(data['song'].cat.categories))
    user_count = data.groupby('user').size()
    # to_generate = sorted(list(songsd), key=lambda x: -user_count[x])

    similarity = bm25_weight(matrix)

    # print(sym)
    sym[sym['syd'].isin(list(songsd.index))]

    from scipy.sparse.linalg import svds

    Ur, Si, VTr = svds(bm25_weight(coo_matrix(matrix)), k=100)
    # print(Ur.shape, Si.shape, VTr.shape,user.shape,matrix.shape,data.shape,songsd.shape,user_count.shape)
    VTr = pd.DataFrame(VTr)

    from sklearn.metrics.pairwise import cosine_similarity
    Sddf = pd.DataFrame(cosine_similarity(Ur, VTr.T),
                        columns=user_count.index,
                        index=list(songsd.index))
    Sddf.to_csv('Sddf.csv')

    Sydi = pd.DataFrame(cosine_similarity(Ur, VTr.T))

    ###changes

    # booknr=13 #symptoom4
    # b='Headache'
    # print('Symptom',sym[sym['symptom']==b])

    # a = input("Enter your symptom:")
    file = open("Symptom.txt", "r")
    x = file.readlines()
    file.close()
    print(x)
    r1 = []
    for j in range(len(x)):
        a = x[j]
        #print("symptom")
        #print(x[0])
        #print(a)
        data = pd.read_csv("sym_t.csv")
        c = 0
        for i in data['symptom']:
            print(c)
            c += 1
            print(type(i))
            #print("i is"+i)
            if a == (i + "\n"):
                break

        print("Value of c")
        print(c)
        booknr = c  # symptoom4
        print('Symptom', sym[sym['syd'] == booknr])
        print('top 7 related disease probability'
              )  # ,Sddf[booknr].sort_values(ascending=False))
        print()

        # print(sym.loc["symptom"]=="Headache")

        data = pd.read_csv("sym_t.csv", index_col="symptom")
        # print(data.loc["Headache"], ["syd"])
        #print("hi")
        lijst = Sddf[booknr].sort_values(ascending=False).index
        for xi in lijst[:4]:
            r1.append(dia[dia['did'] == xi].diagnose.values)
    #print(type(r1[0][0]))
    file = open("Disease.txt", "a")
    for j in range(len(r1)):
        file.write(r1[j][0])
        file.write("\n")
    file.close()

    #print(r1[0][0])
    '''lijst = list(lijst[:3])
Exemplo n.º 53
0
def Demo():

    #__SUPPLY_CHAIN_NETWORK__###################################################
    N = 10  # number of possible maps
    T = 1000  # number of time steps
    eta = .01  # learning rate

    # Define Domains and Compute Equilbria
    Domains = []
    X_Stars = []
    CurlBounds = []
    for n in range(N):
        # Create Domain
        Network = CreateRandomNetwork(I=3, Nm=2, Nd=2, Nr=1, seed=n)
        Domain = SupplyChain(Network=Network, alpha=2)

        # Record Domain
        Domains += [Domain]

        # Set Method
        Method = HeunEuler(Domain=Domain, P=BoxProjection(lo=0), Delta0=1e-3)

        # Initialize Starting Point
        Start = np.zeros(Domain.Dim)

        # Calculate Initial Gap
        gap_0 = Domain.gap_rplus(Start)

        # Calculate Curl Bound
        J = approx_jacobian(Domain.F, Start)
        _J = approx_jacobian(Domain.F, Start + 0.5)
        assert np.allclose(J, _J, atol=1e-5)
        CurlBounds += [
            np.sqrt(18) *
            svds(J, k=1, which='LM', return_singular_vectors=False).item()
        ]
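        # svds with k=1 and which='LM' returns the largest singular value of
        # the Jacobian, i.e. its spectral norm; sqrt(18) is the constant this
        # demo multiplies in to obtain its curl bound.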

        # Set Options
        Init = Initialization(Step=-1e-10)
        Term = Termination(MaxIter=25000,
                           Tols=[(Domain.gap_rplus, 1e-3 * gap_0)])
        Repo = Reporting(Requests=[Domain.gap_rplus])
        Misc = Miscellaneous()
        Options = DescentOptions(Init, Term, Repo, Misc)

        # Print Stats
        PrintSimStats(Domain, Method, Options)

        # Start Solver
        tic = time.time()
        SupplyChain_Results = Solve(Start, Method, Domain, Options)
        toc = time.time() - tic

        # Print Results
        PrintSimResults(Options, SupplyChain_Results, Method, toc)

        # Record X_Star
        X_Star = SupplyChain_Results.TempStorage['Data'][-1]
        X_Stars += [X_Star]
    X_Stars = np.asarray(X_Stars)

    # Compute Equilibrium of Average Domain
    Domain = AverageDomains(Domains)

    # Set Method
    Method = HeunEuler(Domain=Domain, P=BoxProjection(lo=0), Delta0=1e-3)

    # Initialize Starting Point
    Start = np.zeros(Domain.Dim)

    # Calculate Initial Gap
    gap_0 = Domain.gap_rplus(Start)

    # Set Options
    Init = Initialization(Step=-1e-10)
    Term = Termination(MaxIter=25000, Tols=[(Domain.gap_rplus, 1e-3 * gap_0)])
    Repo = Reporting(Requests=[Domain.gap_rplus])
    Misc = Miscellaneous()
    Options = DescentOptions(Init, Term, Repo, Misc)

    # Print Stats
    PrintSimStats(Domain, Method, Options)

    # Start Solver
    tic = time.time()
    SupplyChain_Results = Solve(Start, Method, Domain, Options)
    toc = time.time() - tic

    # Print Results
    PrintSimResults(Options, SupplyChain_Results, Method, toc)

    # Record X_Opt
    # X_Opt = SupplyChain_Results.TempStorage['Data'][-1]
    X_Opt = np.mean(X_Stars, axis=0)

    print('Starting Online Learning')

    # Set First Prediction
    X = np.zeros(X_Stars.shape[1])

    # Select First Domain
    # idx = np.argmax(np.linalg.norm(X_Stars - X,axis=1))
    idx = 0

    distances = []
    loss_infs = []
    regret_standards = []
    regret_news = []
    stokes = []
    ts = range(T)
    for t in ts:
        print('t = ' + str(t))
        # retrieve domain
        Domain = Domains[idx]
        # retrieve equilibrium / reference vector
        equi = X_Stars[idx]
        # calculate distance
        distances += [np.linalg.norm(equi - X)]
        # calculate infinity loss
        loss_infs += [infinity_loss(Domain, X)]
        # calculate standard regret
        ci_predict = ContourIntegral(Domain, LineContour(equi, X))
        predict_loss = integral(ci_predict)
        ci_opt = ContourIntegral(Domain, LineContour(equi, X_Opt))
        predict_opt = integral(ci_opt)
        regret_standards += [predict_loss - predict_opt]
        # calculate new regret
        ci_new = ContourIntegral(Domain, LineContour(X_Opt, X))
        regret_news += [integral(ci_new)]
        # calculate bound
        # area = 0.5*np.prod(np.sort([np.linalg.norm(X_Opt-equi),np.linalg.norm(X-X_Opt),np.linalg.norm(equi-X)])[:2])  # area upper bound
        area = herons(X_Opt, X, equi)  # exact area
        stokes += [CurlBounds[idx] * area]
        # update prediction
        X = BoxProjection(lo=0).P(X, -eta, Domain.F(X))
        # update domain
        # idx = np.argmax(np.linalg.norm(X_Stars - X,axis=1))
        idx = (idx + 1) % X_Stars.shape[0]

    ts_p1 = range(1, T + 1)
    distances_avg = np.divide(distances, ts_p1)
    loss_infs_avg = np.divide(loss_infs, ts_p1)
    regret_standards_avg = np.divide(regret_standards, ts_p1)
    regret_news_avg = np.divide(regret_news, ts_p1)
    stokes = np.asarray(stokes)

    np.savez_compressed('NoRegret_SCN.npz',
                        d_avg=distances_avg,
                        linf_avg=loss_infs_avg,
                        rs_avg=regret_standards_avg,
                        rn_avg=regret_news_avg,
                        stokes=stokes)

    # plt.subplot(2, 1, 1)
    # plt.plot(ts, distances_avg, 'k',label='Average Distance')
    # plt.title('Demonstration of No-Regret on MLN')
    # plt.ylabel('Euclidean Distance')
    # plt.legend()

    ax = plt.subplot(1, 1, 1)
    plt.plot(ts, loss_infs_avg, 'k--', label=r'loss$_{\infty}$')
    plt.plot(ts,
             regret_standards_avg,
             'r--o',
             markevery=T // 20,
             label=r'regret$_{s}$')
    plt.plot(ts, regret_news_avg, 'b-', label=r'regret$_{n}$')
    ax.fill_between(ts,
                    regret_news_avg - stokes,
                    regret_news_avg + stokes,
                    facecolor='c',
                    alpha=0.2,
                    zorder=0,
                    label='Stokes Bound')
    plt.plot(ts, np.zeros_like(ts), 'w-', lw=1)
    plt.xlabel('Time Step')
    plt.ylabel('Aggregate System-Wide Loss')
    plt.xlim([0, T])
    plt.ylim([-250, 1000])
    plt.legend()
    plt.title('Demonstration of No-Regret on Supply Chain Network')

    plt.savefig('NoRegret_SCN')
Exemplo n.º 54
0
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize

from flask import Flask, request

with open("./app/helper_functions/songData.json", "r") as f:
    song_transcripts = json.load(f)

print("loading this page")

SONGS = [song_transcripts[index] for index in song_transcripts]
songlist = [song["lyrics"] for song in SONGS]
vectorizer = TfidfVectorizer(stop_words="english", max_df=.8)
docs_compressed = svds(vectorizer.fit_transform(songlist).transpose(),
                       k=40)[2].transpose()
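# svds here acts as latent semantic analysis on the term-document matrix:
# index [2] selects Vt from the (u, s, vt) triple, and its transpose holds
# one 40-dimensional latent vector per song.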
tokenize_transcript(SONGS)
inv_idx = build_inverted_index(SONGS)
idf = compute_idf(inv_idx, len(SONGS))
doc_norms = computer_doc_norms(inv_idx, idf, len(SONGS))


@irsystem.route('search', methods=["POST"])
def getQuery():
    #docs_compressed = docs_compressed.transpose()

    my_json = request.get_json()
    inputquery = my_json.get('search').lower()
    returnquery = runQuery(inputquery.lower(), inv_idx, idf, SONGS, doc_norms,
                           docs_compressed)
    data = []
Exemplo n.º 55
0
def rankOneMatrixPursuit_econ(Y,
                              projMat,
                              solRank,
                              C,
                              projMatC,
                              biasMat,
                              verbose=False):

    M = list()

    rec_X = np.zeros(
        (projMat.shape[0],
         projMat.shape[1]))  # matrix reconstructed using basis matrices

    projMat_c = sp.coo_matrix(projMat)
    row_coo, col_coo, dummy = sp.find(projMat_c != 0)
    proj_coo = list(zip(row_coo, col_coo))  # materialize: reused several times

    # projection
    Y_proj = np.multiply(Y.copy(), projMat)

    # get observed entries as a list for solving linear equations to
    # estimate coefficients
    Yp = np.transpose(np.matrix([Y_proj[el] for el in proj_coo]))

    u_set = list()
    v_set = list()

    rmse_list = list()

    count = -1

    for maxIter in range(solRank):

        count = count + 1

        # residual
        Res_k = np.matrix(Y_proj - rec_X)

        # left and right highest singular vectors of the residual matrix
        Res_k_sp = sp.bsr_matrix(Res_k)
        [u, s, vt] = spl.svds(Res_k_sp, k=1, which='LM')
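        # by Eckart-Young, u * s * vt is the best rank-1 approximation of the
        # residual in Frobenius norm, so each pursuit step removes as much of
        # the remaining energy as any single rank-1 term can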

        # rank-1 basis matrix
        Mk = u * vt
        #u*np.transpose(v)
        proj_basis_set = list()

        # rec_X is 0, when count==0
        if count > 0:
            proj_basis_set.append(np.multiply(rec_X, projMat))

        proj_basis_set.append(np.multiply(Mk, projMat))

        # coordinate wise content of rec_X
        rec_Xp = np.matrix([rec_X[el] for el in proj_coo])
        rec_Xp = np.transpose(rec_Xp)

        Mp = np.matrix([Mk[el] for el in proj_coo])
        Mp = np.transpose(Mp)

        if count > 0:
            M = rec_Xp
            M = np.hstack([M, Mp])
        else:
            M = Mp

        if (count == 0):
            u_set = np.matrix(np.transpose(u))
            v_set = np.matrix(vt)
        else:
            u_set = np.vstack([u_set, np.transpose(u)])
            v_set = np.vstack([v_set, vt])

        #print 'solving for coefficients ...'
        # len(alpha_k) == 2 once count > 0 (previous reconstruction + new basis)
        alpha_k = np.ravel(solve(M, Yp))

        # update coefficients
        if count == 0:
            theta_k = list(alpha_k)
        elif count == 1:
            theta_k[0] = theta_k[0] * alpha_k[0]
            theta_k.append(alpha_k[1])
        else:
            for c in range(maxIter - 1):
                theta_k[c] = theta_k[c] * alpha_k[0]

            theta_k.append(alpha_k[1])

        # reconstruction of the projected matrix
        rec_X = reconstruct_proj(proj_basis_set, alpha_k)
        if verbose:
            #print 'res norm: ' + str(np.linalg.norm(rec_X,'fro'))
            print("iter:" + str(maxIter) + " err: " +
                  str(np.linalg.norm(rec_X - Y_proj, 'fro')))

        #print "coeffs: " + str(theta_k)

        Z_inter = reconstruct_full(u_set, v_set, theta_k, biasMat)

        if verbose:
            err = np.linalg.norm(np.multiply(projMatC - projMat, Z_inter - C),
                                 'fro') / np.sqrt(np.sum(projMatC - projMat))
            print "rmse full: " + str(err)
            rmse_list.append(err)

    # full reconstruction
    Z_final = reconstruct_full(u_set, v_set, theta_k, biasMat)

    return [Z_final, rmse_list]
Exemplo n.º 56
0
    row_indices = []
    col_indices = []
    data_rating = []

    lines = netflix_file.collect()
    for line in lines:
        line_array = line.split(",")
        row_indices.append(int(line_array[0]) - 1)
        col_indices.append(int(line_array[1]) - 1)
        data_rating.append(float(line_array[2]))
    return csr_matrix((data_rating, (row_indices, col_indices)))


if __name__ == "__main__":
    # create the Spark context
    sc = SparkContext(appName="SVD Solver for Netflix Data")
    # input file
    netflix_file = sc.textFile("nf_subsample.csv")
    sparse_data = CSV_to_sparse(netflix_file)
    # k = 20 principal components
    U, s, Vt = svds(sparse_data, 20)
    # put the 20 singular values on a 20 x 20 diagonal for reconstruction
    final_s = np.diag(s)

    matrix_after_svd = U.dot((final_s.dot(Vt)))
    nz_index = sparse_data.nonzero()
    # original minus the reconstructed one
    difference = np.asarray(sparse_data[nz_index] - matrix_after_svd[nz_index])
    # reconstruction error
    loss_l2 = np.sum(difference**2)
    print(loss_l2)
Exemplo n.º 57
0
def rankOneMatrixPursuit(Y,
                         projMat,
                         solRank,
                         C,
                         projMatC,
                         biasMat,
                         verbose=False):

    M = list()

    rec_X = 0  # matrix reconstructed using basis matrices

    projMat_c = sp.coo_matrix(projMat)
    row_coo, col_coo, dummy = sp.find(projMat_c != 0)
    proj_coo = list(zip(row_coo, col_coo))  # materialize: reused every iteration
    proj_basis_set = list()

    # projection
    Y_proj = np.multiply(Y, projMat)

    # get observed entries as a list for solving linear equations to
    # estimate coefficients
    Yp = np.transpose(np.matrix([Y_proj[el] for el in proj_coo]))

    # for rank-1 matrices of the form M = u*v', storing u and v are sufficient
    u_set = list()
    v_set = list()

    rmse_list = list()

    for maxIter in range(solRank):

        # residual
        Res_k = np.matrix(Y_proj - rec_X)

        # get left and right highest singular vectors of residual
        Res_k = sp.bsr_matrix(Res_k)
        [u, s, vt] = spl.svds(Res_k, k=1, which='LM')

        # get the rank-1 basis matrix
        Mk = u * vt
        proj_basis_set.append(np.multiply(
            Mk, projMat))  # projection on observed entries

        Mp = np.matrix([Mk[el] for el in proj_coo])
        Mp = np.transpose(
            Mp)  # get the entries for solving the linear-equations

        if (len(M) == 0):
            M = Mp
            u_set = np.matrix(np.transpose(u))
            v_set = np.matrix(vt)

        else:
            M = np.hstack([M, Mp])
            u_set = np.vstack([u_set, np.transpose(u)])
            v_set = np.vstack([v_set, vt])

        # solving for coefficients of the basis matrices
        theta_k = solve(M, Yp)

        # performing reconstruction on the projected set
        rec_X = reconstruct_proj(proj_basis_set, theta_k)

        if verbose:
            #print 'res norm: ' + str(np.linalg.norm(rec_X,'fro'))
            print("iter:" + str(maxIter) + " err: " +
                  str(np.linalg.norm(rec_X - Y_proj, 'fro')))

        #print "coeffs: " + str(theta_k)
        if verbose:
            Z_inter = reconstruct_full(u_set, v_set, theta_k, biasMat)
            err = np.linalg.norm(np.multiply(projMatC - projMat, Z_inter - C),
                                 'fro') / np.sqrt(np.sum(projMatC - projMat))
            print "rmse full: " + str(err)
            rmse_list.append(err)

    print(theta_k)
    Z_final = reconstruct_full(u_set, v_set, theta_k, biasMat)

    return [Z_final, rmse_list]
Exemplo n.º 58
0
def svd_wrapper(matrix, mode, ncomp, debug, verbose, usv=False):
    """ Wrapper for different SVD libraries with the option of showing the 
    cumulative explained variance ratio.
    
    Note:
    ----
    Sklearn.PCA deprecated as it uses linalg.svd(X, full_matrices=False) under 
    the hood, which is already included.
    Sklearn.RandomizedPCA deprecated as it uses sklearn.randomized_svd which is
    already included.
    
    """
    if matrix.ndim != 2:
        raise TypeError('Input matrix is not a 2d array')
    
    def reconstruction(ncomp, U, S, V, var=1): 
        if mode=='lapack':
            rec_matrix = np.dot(U[:,:ncomp], np.dot(np.diag(S[:ncomp]), V[:ncomp]))
            rec_matrix = rec_matrix.T
            print('  Matrix reconstruction with {:} PCs:'.format(ncomp))
            print('  Mean Absolute Error =', mean_absolute_error(matrix, 
                                                                 rec_matrix))
            print('  Mean Squared Error =', mean_squared_error(matrix,rec_matrix))
            
            exp_var = S**2
            full_var = np.sum(S**2)
            explained_variance_ratio = exp_var / full_var        # % of variance explained by each PC
            ratio_cumsum = np.cumsum(explained_variance_ratio)
        elif mode=='eigen':
            exp_var = S**2                                       # squared because we previously took the sqrt of the EVals
            full_var = np.sum(S**2)
            explained_variance_ratio = exp_var / full_var        # % of variance explained by each PC
            ratio_cumsum = np.cumsum(explained_variance_ratio)
        else:
            rec_matrix = np.dot(U, np.dot(np.diag(S), V))
            print('  Matrix reconstruction MAE =', mean_absolute_error(matrix, 
                                                                       rec_matrix))
            exp_var = (S**2) / matrix.shape[0]
            full_var = np.var(matrix, axis=0).sum()
            explained_variance_ratio = exp_var / full_var        # % of variance explained by each PC       
            if var==1:  pass    
            else:  explained_variance_ratio = explained_variance_ratio[::-1]
            ratio_cumsum = np.cumsum(explained_variance_ratio)
            msg = '  This info makes sense when the matrix is mean centered '
            msg += '(temp-mean scaling)'
            print(msg)
        
        lw = 2
        alpha = 0.4
        fig = plt.figure(figsize=(6,3))
        fig.subplots_adjust(wspace=0.4)
        ax1 = plt.subplot2grid((1,3), (0,0), colspan=2)
        ax1.step(list(range(explained_variance_ratio.shape[0])), 
                 explained_variance_ratio, alpha=alpha, where='mid', 
                 label='Individual EVR', lw=lw)
        ax1.plot(ratio_cumsum, '.-', alpha=alpha, 
                 label='Cumulative EVR', lw=lw)
        ax1.legend(loc='best', frameon=False, fontsize='medium')
        ax1.set_ylabel('Explained variance ratio (EVR)')
        ax1.set_xlabel('Principal components')
        ax1.grid(linestyle='solid', alpha=0.2)
        ax1.set_xlim(-10, explained_variance_ratio.shape[0]+10)
        ax1.set_ylim(0, 1)
        
        trunc = 20
        ax2 = plt.subplot2grid((1,3), (0,2), colspan=1)
        #plt.setp(ax2.get_yticklabels(), visible=False)
        ax2.step(list(range(trunc)), explained_variance_ratio[:trunc], alpha=alpha, 
                 where='mid', lw=lw)
        ax2.plot(ratio_cumsum[:trunc], '.-', alpha=alpha, lw=lw)
        ax2.set_xlabel('Principal components')
        ax2.grid(linestyle='solid', alpha=0.2)
        ax2.set_xlim(-2, trunc+2)
        ax2.set_ylim(0, 1)
        
        msg = '  Cumulative explained variance ratio for {:} PCs = {:.5f}'
        #plt.savefig('figure.pdf', dpi=300, bbox_inches='tight')
        print(msg.format(ncomp, ratio_cumsum[ncomp-1]))
        
    if ncomp>min(matrix.shape[0],matrix.shape[1]):
        msg = 'Cannot obtain {:} PCs from a matrix with size [{:},{:}].'
        msg += ' Increase the size of the patches or decrease the number of'
        msg += ' principal components.'
        raise RuntimeError(msg.format(ncomp, matrix.shape[0], matrix.shape[1]))
        
    if mode=='eigen':
        # in our data n_frames is always smaller than n_pixels. In this setting
        # by taking the covariance as np.dot(matrix.T,matrix) we get all 
        # (n_pixels) eigenvectors but it is much slower and takes more memory 
        M = np.dot(matrix, matrix.T)                             # covariance matrix
        e, EV = linalg.eigh(M)                                   # eigenvalues and eigenvectors
        pc = np.dot(EV.T, matrix)                                # PCs using a compact trick when cov is MM'
        V = pc[::-1]                                             # reverse since last eigenvectors are the ones we want 
        S = np.sqrt(e)[::-1]                                     # reverse since eigenvalues are in increasing order 
        if debug: reconstruction(ncomp, None, S, None)
        for i in range(V.shape[1]):
            V[:,i] /= S                                          # scaling by the square root of eigenvalues
        V = V[:ncomp]
        if verbose: print('Done PCA with numpy linalg eigh functions')
        
    elif mode=='lapack':
        # in our data n_frames is always smaller than n_pixels. In this setting
        # taking the SVD of M' and keeping the left (transposed) SVs is faster
        # than taking the SVD of M and taking the right ones
        U, S, V = linalg.svd(matrix.T, full_matrices=False)         
        if debug: reconstruction(ncomp, U, S, V)
        V = V[:ncomp]                                           # we cut projection matrix according to the # of PCs               
        U = U[:,:ncomp]
        S = S[:ncomp]
        if verbose: print('Done SVD/PCA with numpy SVD (LAPACK)')
            
    elif mode=='arpack':
        U, S, V = svds(matrix, k=ncomp) 
        if debug: reconstruction(ncomp, U, S, V, -1)
        if verbose: print('Done SVD/PCA with scipy sparse SVD (ARPACK)')

    elif mode=='randsvd':
        U, S, V = randomized_svd(matrix, n_components=ncomp, n_iter=2, 
                                 transpose='auto', random_state=None)
        if debug: reconstruction(ncomp, U, S, V)
        if verbose: print('Done SVD/PCA with randomized SVD')

    else:
        raise TypeError('The SVD mode is not available')
            
    if usv:
        if mode=='lapack':
            return U.T, S, V.T
        else:
            return U, S, V
    else:
        if mode=='lapack':
            return U.T
        else:
            return V
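
The 'eigen' mode above exploits n_frames << n_pixels: diagonalizing the small n_frames x n_frames matrix M M^T yields the same singular values and right singular vectors as an SVD of M. A minimal sanity check of that trick (an illustration, not part of the original function):

import numpy as np

rng = np.random.default_rng(0)
mat = rng.standard_normal((20, 500))           # n_frames << n_pixels

e, EV = np.linalg.eigh(mat @ mat.T)            # eigh on (20, 20) instead of SVD on (20, 500)
S = np.sqrt(np.clip(e, 0, None))[::-1]         # singular values, descending
V = (EV.T @ mat)[::-1] / S[:, None]            # unit-norm right singular vectors

_, S_ref, V_ref = np.linalg.svd(mat, full_matrices=False)
print(np.allclose(S, S_ref))                   # True
print(np.allclose(np.abs(V), np.abs(V_ref)))   # True, up to sign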
Exemplo n.º 59
0
def err(tol):
    # Relative error of the k singular values returned by svds at a given
    # tolerance; A, k, n, s and self.solver come from the enclosing test.
    _, s2, _ = svds(A, k=k, v0=np.ones(n), solver=self.solver, tol=tol)
    return np.linalg.norm((s2 - s[k - 1::-1]) / s[k - 1::-1])
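
This helper closes over A, k, n, s and self.solver from its surrounding test; s holds reference singular values in descending order, so s[k - 1::-1] is the top k reordered to match the ascending output of svds. A self-contained version with illustrative shapes and the default solver could look like:

import numpy as np
from scipy.sparse.linalg import svds

rng = np.random.default_rng(1)
n, k = 50, 5
A = rng.standard_normal((n, n))
s = np.linalg.svd(A, compute_uv=False)    # descending reference values

def err(tol):
    # relative error of the k largest singular values at a given tolerance
    _, s2, _ = svds(A, k=k, v0=np.ones(n), tol=tol)
    return np.linalg.norm((s2 - s[k - 1::-1]) / s[k - 1::-1])

print(err(0))    # tol=0 asks ARPACK for machine-precision convergence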
Exemplo n.º 60
0
    def _fit_truncated(self, X, n_components, svd_solver):
        """Fit the model by computing truncated SVD (by ARPACK or randomized)
        on X
        """
        n_samples, n_features = X.shape

        if isinstance(n_components, six.string_types):
            raise ValueError("n_components=%r cannot be a string "
                             "with svd_solver='%s'"
                             % (n_components, svd_solver))
        elif not 1 <= n_components <= min(n_samples, n_features):
            raise ValueError("n_components=%r must be between 1 and "
                             "min(n_samples, n_features)=%r with "
                             "svd_solver='%s'"
                             % (n_components, min(n_samples, n_features),
                                svd_solver))
        elif not isinstance(n_components, (numbers.Integral, np.integer)):
            raise ValueError("n_components=%r must be of type int "
                             "when greater than or equal to 1, was of type=%r"
                             % (n_components, type(n_components)))
        elif svd_solver == 'arpack' and n_components == min(n_samples,
                                                            n_features):
            raise ValueError("n_components=%r must be strictly less than "
                             "min(n_samples, n_features)=%r with "
                             "svd_solver='%s'"
                             % (n_components, min(n_samples, n_features),
                                svd_solver))

        random_state = check_random_state(self.random_state)

        # Center data
        self.mean_ = np.mean(X, axis=0)
        X -= self.mean_

        if svd_solver == 'arpack':
            # random init solution, as ARPACK does it internally
            v0 = random_state.uniform(-1, 1, size=min(X.shape))
            U, S, V = svds(X, k=n_components, tol=self.tol, v0=v0)
            # svds doesn't abide by scipy.linalg.svd/randomized_svd
            # conventions, so reverse its outputs.
            S = S[::-1]
            # flip eigenvectors' sign to enforce deterministic output
            U, V = svd_flip(U[:, ::-1], V[::-1])

        elif svd_solver == 'randomized':
            # sign flipping is done inside
            U, S, V = randomized_svd(X, n_components=n_components,
                                     n_iter=self.iterated_power,
                                     flip_sign=True,
                                     random_state=random_state)

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = V
        self.n_components_ = n_components

        # Get variance explained by singular values
        self.explained_variance_ = (S ** 2) / (n_samples - 1)
        total_var = np.var(X, ddof=1, axis=0)
        self.explained_variance_ratio_ = \
            self.explained_variance_ / total_var.sum()
        self.singular_values_ = S.copy()  # Store the singular values.

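        # The discarded components are treated as isotropic noise:
        # noise_variance_ is the average of the (min(n_samples, n_features)
        # - n_components) smallest eigenvalues of the covariance matrix,
        # as in probabilistic PCA (Tipping & Bishop, 1999).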
        if self.n_components_ < min(n_features, n_samples):
            self.noise_variance_ = (total_var.sum() -
                                    self.explained_variance_.sum())
            self.noise_variance_ /= min(n_features, n_samples) - n_components
        else:
            self.noise_variance_ = 0.

        return U, S, V
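
The ARPACK branch above reverses the svds output because ARPACK returns singular values in ascending order, and svd_flip makes the component signs deterministic. A small demonstration of both conventions (illustrative shapes, assuming scipy and scikit-learn are installed):

import numpy as np
from scipy.sparse.linalg import svds
from sklearn.utils.extmath import svd_flip

rng = np.random.default_rng(0)
X = rng.standard_normal((30, 8))
X -= X.mean(axis=0)                     # center, as _fit_truncated does

U, S, V = svds(X, k=3)                  # S comes back in ascending order
S = S[::-1]                             # descending, like linalg.svd
U, V = svd_flip(U[:, ::-1], V[::-1])    # reorder and pin component signs

S_ref = np.linalg.svd(X, compute_uv=False)[:3]
print(np.allclose(S, S_ref))            # True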