def pseudoinverse(Mat, precision):
    """
    Pseudoinverse computation.

    Objective:
    ----------
    Compute the pseudoinverse using Singular Value Decomposition.

    Reason:
    -------
    SVD via SciPy is slow and consumes a lot of memory; a pysparse matrix
    likewise consumes a lot of memory. This is a better alternative to a
    direct computation of the inverse.

    Process:
    --------
    The function uses sparsesvd to compute the SVD of a sparse matrix.
    The precision argument controls the truncation (the k) of the SVD.
    Precision is a percentage and is used to derive k:
    k = (precision / 100) * number of rows of the matrix.
    The function takes a sparse matrix and a precision score as input.
    """
    matrix = Mat.tocsc()
    if matrix.shape[0] <= matrix.shape[1]:
        k = int((precision * matrix.shape[0]) / 100)
        ut, s, vt = sparsesvd(matrix.tocsc(), k)
        UT = ss.csr_matrix(ut)
        SI = ss.csr_matrix(np.diag(1 / s))
        VT = ss.csr_matrix(vt)
        temp_matrix = spmatrixmul(VT.transpose(), SI)
        pinv_matrix = spmatrixmul(temp_matrix, UT)
        del ut, s, vt, UT, SI, VT, temp_matrix
    else:
        k = int((precision * matrix.transpose().shape[0]) / 100)
        ut, s, vt = sparsesvd(matrix.transpose().tocsc(), k)
        UT = ss.csr_matrix(ut)
        SI = ss.csr_matrix(np.diag(1 / s))
        VT = ss.csr_matrix(vt)
        temp_matrix = spmatrixmul(UT.transpose(), SI)
        pinv_matrix = spmatrixmul(temp_matrix, VT)
        del ut, s, vt, UT, SI, VT, temp_matrix
    return pinv_matrix.tocsr()
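# --- Usage sketch (not part of the original code) ---
# A minimal, hedged example of calling pseudoinverse() above. It assumes the
# snippet's aliases `np` (numpy) and `ss` (scipy.sparse), and supplies a
# hypothetical stand-in for the project's `spmatrixmul` helper as a plain
# sparse product. With precision=90, k is 90% of the smaller dimension.
import numpy as np
import scipy.sparse as ss
from sparsesvd import sparsesvd

def spmatrixmul(a, b):
    # stand-in for the original helper: plain sparse matrix product
    return (a * b).tocsr()

if __name__ == '__main__':
    A = ss.rand(200, 50, density=0.05, format='csc')
    A_pinv = pseudoinverse(A, 90)
    print(A_pinv.shape)  # (50, 200)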
def tune(my_corpus, dictionary, min_topics=2, max_topics=50, step=2):
    def sym_kl(p, q):
        return np.sum([scipy.stats.entropy(p, q), scipy.stats.entropy(q, p)])

    kl = []
    Hbar = []
    perplexity = []
    n_topics = []
    l = np.array([sum(cnt for _, cnt in doc) for doc in my_corpus])
    corpus = Index.get_corpus('train features')
    for i in range(min_topics, max_topics, step):
        n_topics.append(i)
        lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                              num_topics=i, alpha='auto')
        m1 = scipy.sparse.csc_matrix(lda.expElogbeta)
        U, cm1, V = sparsesvd(m1, m1.shape[0])
        # Document-topic matrix
        lda_topics = lda[my_corpus]
        m2 = gensim.matutils.corpus2dense(lda_topics, lda.num_topics).transpose()
        cm2 = l.dot(m2)
        cm2 = cm2 + 0.0001
        cm2norm = np.linalg.norm(l)
        cm2 = cm2 / cm2norm
        kl.append(sym_kl(cm1, cm2))
        entropy_list = [scipy.stats.entropy([x[1] for x in lda[v]]) for v in my_corpus]
        Hbar.append(np.mean(entropy_list))
        perplexity.append(lda.log_perplexity(my_corpus))
        print("NumTopics: %s | Unscaled Entropy: %s | Per-word-bound: %s | "
              "Per-word-perplexity: %s | Arun measure %s" %
              (i, Hbar[-1], perplexity[-1], np.exp2(-perplexity[-1]), kl[-1]))
    return n_topics, Hbar, perplexity, kl
def LSA(M, k):
    ## will return top k sentences
    SM = scipy.sparse.csc_matrix(M)  # convert to sparse CSC format
    u, s, vt = sparsesvd(SM, k + 10)
    ## SVD calculated at this stage, concept matrix vt; from now we can apply
    ## various approaches to filter out top k sentences.
    ## We are using OzSoy's approach
    ## Using Cross Method
    m, n = M.shape
    Avg = numpy.average(M, 1)
    for i in range(0, m):
        for j in range(0, n):
            if M[i][j] < Avg[i]:
                M[i][j] = 0
    Length = numpy.dot(s, vt)
    L = []
    ## returning top k sentences
    for i in range(0, n):
        L.append(tuple([Length[i], i]))
    if k >= len(L):
        return L
    # building min heap
    count = int(k / 2 - 1)
    while count >= 0:
        L = heapify(L, count, k)
        count -= 1
    for i in range(k, len(L)):
        if L[0][0] < L[i][0]:
            L[0] = L[i]
            L = heapify(L, 0, k)
    return L[:k]
def generate_model(in_path, title_limit, user_limit, features, out_path):
    # connect to db
    db = pg.connect(in_path)
    # load scores
    scores = load_scores(db)
    db.close()
    print "Loaded scores"
    # filter insignificant titles/users, second filtering to remove empty cols/rows
    (mat, old_ids_1) = filter_too_small(scores, title_limit, user_limit)
    (mat, old_ids_2) = filter_too_small(mat.tocsc(), 1, 1)
    print "Filtered insignificant titles and users"
    # matrix is in csr format, calc row nnz averages and convert to csc
    averages = map(lambda x: row_nnz_average(mat, x), range(0, mat.shape[0]))
    mat = mat.tocsc()
    # build compact titleid translation tables
    old_ids = join_old_id_dicts(old_ids_1, old_ids_2)
    (title_to_document, document_to_tile) = build_title_mapping(old_ids, mat.shape[0])
    # run svd
    print "Built additional data"
    (ut, s, vt) = sparsesvd(mat.tocsc(), features)
    print "Factorization finished"
    s_sqrt = numpy.diag(numpy.sqrt(s))
    s_inv = numpy.diag(numpy.power(s, -1))
    terms = ut.transpose().dot(s_sqrt)
    documents = s_sqrt.dot(s_inv).dot(ut)
    # dump results
    savemat(out_path, {"Terms": terms, "Documents": documents, "Averages": averages,
                       "TitleMapping": title_to_document, "DocumentMapping": document_to_tile},
            oned_as='row')
    print "Saved generated results"
def append_lines_update_svd(m_old, m_new):
    if m_old.shape[1] != m_new.shape[1]:
        print('\nAppend_Lines: the new matrix must have the same number of columns as the old matrix!')
        return
    m, n = m_old.shape
    factor = min(m, n)
    # factor = 5
    c = m_new.shape[0] - m
    # print("c {}, factor {}".format(c, factor))
    temp = sps.csc_matrix(m_old)
    u, s, v = sparsesvd.sparsesvd(temp, factor)
    u = u.T
    # print("u shape {}".format(u.shape))
    B = m_new[m:, :].T
    A = np.concatenate((np.zeros((m, c)), np.eye(c)), axis=0)
    S = np.diag(s)
    U = np.concatenate((u, np.zeros((c, u.shape[1]))), axis=0)
    V = v.T
    # print("u shape {}, A shape {}, U shape {}, V shape {}".format(u.shape, A.shape, U.shape, V.shape))
    u_new, s_new, v_new = increment_svd(U, S, V, A, B)
    e = np.linalg.norm(m_new - np.dot(u_new, np.dot(s_new, v_new.T)), 2)
    print('Error is', e)
def computeSVD(urm, K):
    U, s, Vt = sparsesvd(urm, K)
    dim = (len(s), len(s))
    S = np.zeros(dim, dtype=np.float32)
    for i in range(0, len(s)):
        S[i, i] = math.sqrt(s[i])
    return np.transpose(U), S.dot(Vt)
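# --- Usage sketch (not part of the original code) ---
# Illustrates computeSVD() above on a small synthetic user-rating matrix.
# Assumes `np` is numpy and `math` is the standard-library module, as the
# snippet implies; the matrix and K below are made up for demonstration.
import math
import numpy as np
from scipy.sparse import csc_matrix
from sparsesvd import sparsesvd

if __name__ == '__main__':
    ratings = np.random.rand(20, 15) * (np.random.rand(20, 15) > 0.8)
    urm = csc_matrix(ratings)
    U, SVt = computeSVD(urm, K=5)
    # U holds the user factors, SVt the sqrt-weighted item factors
    print(U.shape, SVt.shape)  # roughly (20, 5) and (5, 15); fewer factors if rank < K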
def build(self):
    # Strictly speaking, enumeration order of Python dict's is not defined.
    # In other words, iterating over the same dict twice may yield different results.
    # So, we have to keep the mapping between user/item, and row/column.
    self.user_row_index = dict()  # type: Dict[str, int]
    self.item_col_index = dict()  # type: Dict[str, int]
    self.rating_matrix_demeaned = dok_matrix((len(self.user_histories), len(self.item_histories)))
    for col_index, (item_id, histories) in enumerate(self.item_histories.items()):
        self.item_col_index[item_id] = col_index
        avg_rating = np.mean([x.rating for x in histories])
        self.item_average_rating[item_id] = avg_rating
        for record in histories:
            row_index = self.user_row_index.get(record.user_id, len(self.user_row_index))
            self.user_row_index[record.user_id] = row_index
            self.rating_matrix_demeaned[row_index, col_index] = record.rating - avg_rating
    self.global_average = np.mean(list(self.item_average_rating.values()))
    u, s, v = sparsesvd(csc_matrix(self.rating_matrix_demeaned), self.components)
    row_vectors = u.T
    self.col_vectors = np.dot(np.diag(s), v).T
    self.col_vectors_inv = np.linalg.pinv(self.col_vectors.transpose())
    for u, row in self.user_row_index.items():
        self.user_vectors[u] = row_vectors[row]
    for i, col in self.item_col_index.items():
        self.item_vectors[i] = self.col_vectors[col]
def main(): args = docopt(""" Usage: pmi2svd.py [options] <repres> <pmi_path> <output_path> Options: --dim NUM Dimensionality of eigenvectors [default: 500] --neg NUM Number of negative samples; subtracts its log from PMI [default: 1] --k NUM [default: 1] """) repres = args['<repres>'] pmi_path = args['<pmi_path>'] output_path = args['<output_path>'] dim = int(args['--dim']) neg = int(args['--neg']) k = int(args['--k']) if (repres == "BPMI"): explicit = BinExplicit(pmi_path, normalize=False) elif (repres == "PMI"): explicit = NoExplicit(pmi_path, normalize=False, k=k) elif (repres == "NPMI"): explicit = NegExplicit(pmi_path, normalize=False) else: explicit = PositiveExplicit(pmi_path, normalize=False, neg=neg) ut, s, vt = sparsesvd(explicit.m.tocsc(), dim) np.save(output_path + '.ut.npy', ut) np.save(output_path + '.s.npy', s) np.save(output_path + '.vt.npy', vt) save_vocabulary(output_path + '.words.vocab', explicit.iw) save_vocabulary(output_path + '.contexts.vocab', explicit.ic)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--ppmi_file", type=str, required=True,
                        help="Path to the counts (matrix) file.")
    parser.add_argument("--svd_file", type=str, required=True,
                        help="Path to the SVD file.")
    parser.add_argument("--input_vocab_file", type=str, required=True,
                        help="Path to the input vocabulary file.")
    parser.add_argument("--output_vocab_file", type=str, required=True,
                        help="Path to the output vocabulary file.")
    parser.add_argument("--size", type=int, default=100,
                        help="Vector size.")
    parser.add_argument("--normalize", action="store_true",
                        help="If set, we factorize the normalized PPMI matrix.")
    args = parser.parse_args()

    print("Ppmi2svd")
    input_vocab, _ = load_vocabulary(args.input_vocab_file)
    output_vocab, _ = load_vocabulary(args.output_vocab_file)
    ppmi, _, _ = load_sparse(args.ppmi_file)
    if args.normalize:
        ppmi = normalize(ppmi, sparse=True)

    ut, s, vt = sparsesvd(ppmi.tocsc(), args.size)

    np.save(args.svd_file + ".ut.npy", ut)
    np.save(args.svd_file + ".s.npy", s)
    np.save(args.svd_file + ".vt.npy", vt)
    save_dense(args.svd_file + ".input", ut.T, input_vocab)
    save_dense(args.svd_file + ".output", vt.T, output_vocab)
    print("Ppmi2svd finished")
def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS, extra_dims=P2_EXTRA_DIMS):
    """
    Construct the (U, S) projection from a corpus `docs`. The projection can
    be later updated by merging it with another Projection via `self.merge()`.

    This is the class taking care of the 'core math'; interfacing with corpora,
    splitting large corpora into chunks and merging them etc. is done through
    the higher-level `LsiModel` class.
    """
    self.m, self.k = m, k
    self.power_iters = power_iters
    self.extra_dims = extra_dims
    if docs is not None:
        # base case decomposition: given a job `docs`, compute its decomposition,
        # *in-core*.
        if not use_svdlibc:
            u, s = stochastic_svd(
                docs, k, chunksize=sys.maxsize, num_terms=m,
                power_iters=self.power_iters, extra_dims=self.extra_dims)
        else:
            try:
                import sparsesvd
            except ImportError:
                raise ImportError("`sparsesvd` module requested but not found; run `easy_install sparsesvd`")
            logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
            if not scipy.sparse.issparse(docs):
                docs = matutils.corpus2csc(docs)
            # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
            ut, s, vt = sparsesvd.sparsesvd(docs, k + 30)
            u = ut.T
            del ut, vt
        k = clip_spectrum(s**2, self.k)
        self.u = u[:, :k].copy()
        self.s = s[:k].copy()
    else:
        self.u, self.s = None, None
def learnProjection(sourceDomain, targetDomain):
    """
    Learn the projection matrix and store it to a file.
    """
    h = 50  # no. of latent dimensions.
    print "Loading the bipartite matrix...",
    coocData = sio.loadmat("../work/%s-%s/DSxDI.mat" % (sourceDomain, targetDomain))
    M = sp.lil_matrix(coocData['DSxDI'])
    (nDS, nDI) = M.shape
    print "Done."
    print "Computing the Laplacian...",
    D1 = sp.lil_matrix((nDS, nDS), dtype=np.float64)
    D2 = sp.lil_matrix((nDI, nDI), dtype=np.float64)
    for i in range(0, nDS):
        D1[i, i] = 1.0 / np.sqrt(np.sum(M[i, :].data[0]))
    for i in range(0, nDI):
        D2[i, i] = 1.0 / np.sqrt(np.sum(M[:, i].T.data[0]))
    B = (D1.tocsr().dot(M.tocsr())).dot(D2.tocsr())
    print "Done."
    print "Computing SVD...",
    ut, s, vt = sparsesvd(B.tocsc(), h)
    sio.savemat("../work/%s-%s/proj.mat" % (sourceDomain, targetDomain), {'proj': ut.T})
    print "Done."
    pass
def matrixsvd(self):
    svd_matrix = self.projection_matrix.tocsc()
    if self.svd == 'scipy':
        Utemp, Stemp, VTtemp = ssl.svds(
            svd_matrix.tocsc(),
            k=int(self.projection_matrix.tocsr().shape[0] * self.precision / 100))
        UT = np.nan_to_num(Utemp.transpose())
        S = np.nan_to_num(Stemp)
        VT = np.nan_to_num(VTtemp)
    elif self.svd == 'sparsesvd':
        (UT, S, VT) = sparsesvd(svd_matrix,
                                int(svd_matrix.shape[0] * self.precision / 100))
    elif self.svd == 'fast':
        Utemp, Stemp, VTtemp = fast_svd(
            svd_matrix,
            int(self.projection_matrix.tocsr().shape[0] * self.precision / 100))
        UT = np.nan_to_num(Utemp.transpose())
        S = np.nan_to_num(Stemp)
        VT = np.nan_to_num(VTtemp)
    else:
        Utemp, Stemp, VTtemp = np.linalg.svd(svd_matrix.todense())
        UT = np.nan_to_num(Utemp.transpose())
        S = np.nan_to_num(Stemp)
        VT = np.nan_to_num(VTtemp)
    return UT, S, VT
def decompose(self, corpus):
    skip_counts = bounter(size_mb=1024)
    word_counts = bounter(size_mb=1024)
    for l in corpus:
        wds = l.split()
        skips = list(skipgrams(wds, 2, 5))
        skips = ["#".join(t) for t in skips]
        if len(wds) > 0 and len(skips) > 0:
            skip_counts.update(skips)
            word_counts.update(wds)
    vocabulary = list(word_counts)
    shift = 1  # shift 1 does nothing since log(1) == 0.0
    M = count_skipgrams(skip_counts, word_counts, vocabulary, shift)
    # TODO: eigen something trick
    # singular value decomposition
    # U, _, V = svds(M, k=256)  # U, S, V
    U, _, V = sparsesvd(M, 300)
    # add context to U
    word_vecs = U.T + V.T
    del U
    del V
    # normalize rows
    word_vecs_norm = word_vecs / np.sqrt(
        np.sum(word_vecs * word_vecs, axis=0, keepdims=True))
    del word_vecs
    return vocabulary, word_vecs_norm
def cnt2svd(count_file, vocab_file, PPMI):
    with open(count_file, "r", encoding="UTF-8-sig") as src_file:
        text = src_file.readlines()
    word2index = read_vocab(vocab_file)
    print("length of word_dict: " + str(len(word2index)))

    counts = csc_matrix((len(word2index), len(word2index)), dtype="float32")
    tmp_counts = dok_matrix((len(word2index), len(word2index)), dtype="float32")
    times = 0
    for i in range(len(text)):
        word, context, count = text[i].strip().split()
        tmp_counts[word2index[word], word2index[context]] = int(count)
        times += 1
        if times == st.UPDATE_THRESHOLD:
            counts = counts + tmp_counts.tocsc()
            tmp_counts = dok_matrix((len(word2index), len(word2index)), dtype="float32")
            times = 0
    counts = counts + tmp_counts.tocsc()

    # calculate e^pmi
    sum_r = np.array(counts.sum(axis=1))[:, 0]
    sum_c = np.array(counts.sum(axis=0))[0, :]
    sum_total = sum_c.sum()
    sum_r = np.reciprocal(sum_r)
    sum_c = np.reciprocal(sum_c)
    pmi = csc_matrix(counts)
    normalizer = dok_matrix((len(sum_r), len(sum_r)))
    normalizer.setdiag(sum_r)
    pmi = normalizer.tocsc().dot(pmi)
    normalizer = dok_matrix((len(sum_c), len(sum_c)))
    normalizer.setdiag(sum_c)
    pmi = pmi.dot(normalizer.tocsc())
    pmi = pmi * sum_total
    pmi.data = np.log(pmi.data)
    if PPMI:
        pmi[pmi < 0] = 0

    I = eye(pmi.shape[0], format="csc")
    print("start svd")
    start = time.time()
    ut, s = sparsesvd(pmi, I, st.VECTOR_LENGTH)[:2]
    if PPMI:
        for i in range(len(s)):
            ut[i, :] *= np.sqrt(s[i])
    else:
        for i in range(len(s)):
            ut[i, :] *= s[i]
    print(time.time() - start)
    return ut.T, word2index
def __init__(self, m, k, docs=None):
    """
    Store (U, S) projection itself. This is the class taking care of 'core math';
    interfacing with corpora, training etc. is done through class LsiModel.

    `docs` is either a sparse matrix or a corpus which, when converted to a
    sparse matrix, must fit comfortably into main memory.
    """
    self.m, self.k = m, k
    if docs is not None:
        # base case decomposition: given a job `docs`, compute its decomposition
        # in core, algorithm 1
        if utils.isCorpus(docs):
            docs = matutils.corpus2csc(m, docs)
        if m * k < 10000:
            # SVDLIBC gives spurious results for small matrices; run full
            # LAPACK svd on them instead
            docs = docs.todense()
            logger.info("computing dense SVD of %s matrix" % str(docs.shape))
            u, s, vt = numpy.linalg.svd(docs, full_matrices=False)
        else:
            try:
                import sparsesvd
            except ImportError:
                raise ImportError("for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`")
            logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
            # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
            ut, s, vt = sparsesvd.sparsesvd(docs, k + 30)
            u = ut.T
            del ut
            del vt
        k = clipSpectrum(s, self.k)
        self.u, self.s = u[:, :k], s[:k]
    else:
        self.u, self.s = None, None
def applySvd(self):
    len_row = max(self.array_row) + 1
    len_col = max(self.array_col) + 1
    print 'Applying SVD with ROW: ' + str(len_row) + ' and COL: ' + str(len_col)
    sparse_matrix = scipy.sparse.csc_matrix(
        (self.array_data, (self.array_row, self.array_col)),
        shape=(len_row, len_col))
    print 'sparsed matrix'
    Ut, Sigma, Vt = sparsesvd(sparse_matrix, self.svd_dimension)
    print 'U Sigma Vt done!'
    sparse_matrix = array(0)
    print 'Mounting Matrix SVD'
    self.svd_matrix = numpy.dot(Ut.T, numpy.dot(numpy.diag(Sigma), Vt))
    print 'Done!'
    print Ut.T
    print '\n'
    print Sigma
    print '\n'
    print Vt
    print '\n'
    print self.svd_matrix.T
    print '\n'
    Ut = None
    Sigma = None
    Vt = None
def calc_svd(matrix, dim, impl, impl_args):
    """
    Apply truncated SVD with several implementations.

    truncated SVD:
        sparsesvd: https://pypi.org/project/sparsesvd/
        scipy: https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.svd.html

    randomized truncated SVD:
        gensim: https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/lsimodel.py
        scikit: https://scikit-learn.org/stable/modules/generated/sklearn.utils.extmath.randomized_svd.html

    Check out the comparison: https://github.com/jfilter/sparse-svd-benchmark
    """
    if impl == "sparsesvd":
        # originally used SVD implementation
        ut, s, _ = sparsesvd(matrix.m.tocsc(), dim)
        # returns in a different format
        ut = ut.T
    if impl == "scipy":
        ut, s, _ = linalg.svds(matrix.m, dim)
    # randomized (but fast) truncated SVD
    if impl == "gensim":
        # better default arguments
        args = {"power_iters": 5, "extra_dims": 10, **impl_args}
        ut, s = stochastic_svd(matrix.m, dim, matrix.m.shape[0], **args)
    if impl == "scikit":
        ut, s, _ = randomized_svd(matrix.m, dim, **impl_args)
    return ut, s
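# --- Usage sketch (not part of the original code) ---
# calc_svd() above expects an object exposing the sparse matrix on attribute
# `.m`; a SimpleNamespace is used here as a hypothetical stand-in. Only the
# impl="sparsesvd" branch requires the sparsesvd package.
from types import SimpleNamespace
import scipy.sparse as sp
from sparsesvd import sparsesvd

if __name__ == '__main__':
    matrix = SimpleNamespace(m=sp.rand(1000, 1000, density=0.01, format='csr'))
    ut, s = calc_svd(matrix, dim=50, impl="sparsesvd", impl_args={})
    print(ut.shape, s.shape)  # about (1000, 50) and (50,); SVDLIBC may return fewer factors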
def test_svd_matrix(W, WT, D, DT):
    Winv = ss.csr_matrix(np.linalg.pinv(W.todense()))
    WTinv = ss.csr_matrix(np.linalg.pinv(W.transpose().todense()))
    # A = np.dot(np.dot(Winv, D), WTinv)
    A = ((Winv * D) * WTinv)
    A = A.tocsc()
    res_dict = {}
    old_z = 0
    for k in range(270, 280):
        (ut, s, vt) = sparsesvd(A, k)
        U = ss.csr_matrix(ut.T)
        S = ss.csr_matrix(np.diag(s))
        V = ss.csr_matrix(vt)
        L = (W * U) * (S * V * WT.transpose())
        z = U.shape[1]
        if z == old_z:
            break
        else:
            Res = fnorm(L, DT)
            res_dict[z] = Res
            Result = OrderedDict(sorted(res_dict.items(), key=lambda t: np.float64(t[1])))
            old_z = z
    return Result
def load(self, dirname, svd_k=0):
    """
    Load the embedding and optionally perform SVD on load.
    If svd_k is set to 0, no SVD is performed.
    """
    self.dirname = dirname
    try:
        self.emb.x, self.emb.y = load_svmlight_file(dirname + EMBEDDING_FILENAME)
    except (ValueError, IOError):
        return None
    if svd_k != 0:
        try:
            import sparsesvd
            import scipy.sparse
            X = self.emb.x.T
            X = scipy.sparse.csc_matrix(X)
            Ut, S, Vt = sparsesvd.sparsesvd(X, svd_k)
            self.emb.x = scipy.sparse.csr_matrix(Vt.T)
        except ImportError:
            print('Warning: Cannot perform SVD without sparsesvd module')
    self._loadFeatureTable()
    self._loadTOC()
    return self.emb
def sparse_svd():
    svd_input = np.empty((len(request.get_json()), len(request.get_json()[0])))
    for i in range(0, len(request.get_json())):
        svd_input[i] = request.get_json()[i]
    smat = scipy.sparse.csc_matrix(svd_input)
    u, s, vh = sparsesvd(smat, min(smat.shape))
    return json.dumps(u.T.tolist())
def computeSVDpackage(urm, K):
    U, s, Vt = sparsesvd(urm, K)
    # print(U.shape)
    # print(U)
    # print(len(s))
    # print(s.shape)
    print(s)
    # print(Vt)
    print(Vt.shape)
    print(Vt)
    # print(Vt.transpose())
    # print(Ux - U)
    dim = (len(s), len(s))
    S = np.zeros(dim, dtype=np.float32)
    for i in range(0, len(s)):
        # S[i, i] = mt.sqrt(s[i])
        S[i, i] = s[i]
    U = csc_matrix(np.transpose(U), dtype=np.float32)
    S = csc_matrix(S, dtype=np.float32)
    Vt = csc_matrix(Vt, dtype=np.float32)
    return U, S, Vt
def main(): args = docopt(""" Usage: pmi2svd.py [options] <pmi_path> <output_path> Options: --dim NUM Dimensionality of eigenvectors [default: 500] --neg NUM Number of negative samples; subtracts its log from PMI [default: 1] """) pmi_path = args['<pmi_path>'] output_path = args['<output_path>'] dim = int(args['--dim']) neg = int(args['--neg']) explicit = PositiveExplicit(pmi_path, normalize=False, neg=neg) start = time.time() ut, s, vt = sparsesvd(explicit.m.tocsc(), dim) print("Time elapsed for SVD: %f" % (time.time() - start)) np.save(output_path + '.ut.npy', ut) np.save(output_path + '.s.npy', s) np.save(output_path + '.vt.npy', vt) save_vocabulary(output_path + '.words.vocab', explicit.iw) save_vocabulary(output_path + '.contexts.vocab', explicit.ic)
def main():
    en_vector = ENVector()
    en_vector.read_freq("results/freq_en_fixed_pmi.txt")
    #print "Reading Pair Co-occurence"
    #en_vector.read_and_duplicate("results/pair_en_test.txt")
    en_vector.read_pair_pmi("results/pair_en_fixed_pmi.txt")
    en_vector.sort_by_freq()
    #print "Generating Label"
    en_vector.generate_label()
    # print "Generating Matrix Label"
    en_vector.generate_matrix_label()
    #print "Calculating Vector Size"
    en_vector.calculate_size()
    matrix = sp.lil_matrix((limit, limit))
    for i in range(min(limit, len(en_vector.matrix_label))):
        for j in range(len(en_vector.matrix_label[i])):
            if en_vector.matrix_label[i][j] >= limit:
                continue
            word1 = en_vector.word_list[i]
            word2 = en_vector.word_list[en_vector.matrix_label[i][j]]
            matrix[i, en_vector.matrix_label[i][j]] = en_vector.pair_count[(word1, word2)]
    smat = sp.csc_matrix(matrix)
    ut, s, vt = sparsesvd(smat, 10)
    for i in range(limit):
        for j in range(10):
            print (ut[j][i] * s[j]),
        print
def main(): args = docopt(""" Usage: counts2svd.py [options] <counts_path> Options: --dim NUM Dimensionality of eigenvectors [default: 500] --neg NUM Number of negative samples; subtracts its log from PMI [default: 1] --pos NUM Number of positive samples; add its log to PMI [default: 1] --cds NUM Context distribution smoothing [default: 0.75] --randomized Use randomized SVD --normalized Use normalized embedder --oversample NUM Number of oversamples in randomized SVD [default: 10] --power_iter NUM Number of iterations of power method in randomized SVD [default: 2] """) start = time.time() counts_path = args['<counts_path>'] dim = int(args['--dim']) neg = int(args['--neg']) pos = int(args['--pos']) cds = float(args['--cds']) randomized = args['--randomized'] normalized = args['--normalized'] oversample = int(args['--oversample']) power_iter = int(args['--power_iter']) output_path = counts_path + "_svd_dim=%d_neg=%d_pos=%d_cds=%.2f" % (dim, neg, pos, cds) if randomized: output_path += "_rand_oversample=%d_power_iter=%d" % (oversample, power_iter) if normalized: output_path += "_normalized_power_iter=%d" % power_iter logging.basicConfig(filename=output_path + ".log", filemode="w", level=logging.DEBUG) logging.getLogger().addHandler(logging.StreamHandler()) _, iw = load_vocabulary(counts_path + '.words.vocab') adjacency_matrix = load_adjacency_matrix(counts_path) ppmi = build_ppmi_matrix(adjacency_matrix, cds, neg, pos) start_learning = time.time() logging.info("Starting SVD") if randomized: # ppmi = normalize(ppmi, norm='l2', axis=1) s, ut = randomized_eigh(ppmi, dim, oversample, power_iter, row_normalized=normalized) elif normalized: ut = normalized_embedder(ppmi, dim, power_iter) s = np.zeros(dim) else: # ppmi = normalize(ppmi, norm='l2', axis=1) ut, s, _ = sparsesvd(ppmi.tocsc(), dim) ut = ut.T logging.info("Time elapsed on learning: %f" % (time.time() - start_learning)) np.save(output_path + '.vecs.npy', ut) np.save(output_path + '.vals.npy', s) save_vocabulary(output_path + '.words.vocab', iw) logging.info("Time elapsed: %f" % (time.time() - start))
def lsa():
    from sparsesvd import sparsesvd
    from numpy import array
    import scipy.sparse as sp
    # calculate svd and perform lsa
    print "######## READING TERM DOC MATRIX #########"
    termDocEntries = pickle.load(open(outfile + "/tdm.p", "rb"))
    id2title = pickle.load(open(outfile + "/id_file.p", "rb"))
    word2id = pickle.load(open(outfile + "/word_id.p", "rb"))
    fileCount = len(id2title)
    #fileCount = 60000
    vocab_size = len(word2id)
    print "######## READING COMPLETE #########"
    I = array([i for ((i, j), v) in termDocEntries])
    J = array([j for ((i, j), v) in termDocEntries])
    V = array([v for ((i, j), v) in termDocEntries])
    shape = (fileCount, vocab_size)
    print "Dimension of TDM is : ", shape
    print "######## STARTING LSA #########"
    termDocMatrix = sp.csc_matrix((V, (I, J)), shape=(fileCount, vocab_size), dtype=np.float32)
    UT, S, V = sparsesvd(termDocMatrix, 300)
    (m1, m2) = UT.T.shape
    S1 = np.zeros((m2, m2), dtype=np.float32)
    for i in range(m2):
        S1[i][i] = S[i]
    US = np.dot(UT.T, S1)
    print m1, m2
    (n1, n2) = V.shape
    pickle.dump(US, open(outfile + "/u_sigma.p", "wb"))
    pickle.dump(V.T, open(outfile + "/v.p", "wb"))
    print "######## LSA COMPLETE #########"
def applySvd(self): len_row = max(self.array_row) + 1 len_col = max(self.array_col) + 1 print "Applying SVD with ROW: " + str(len_row) + " and COL: " + str(len_col) sparse_matrix = scipy.sparse.csc_matrix( (self.array_data, (self.array_row, self.array_col)), shape=(len_row, len_col) ) print "sparsed matrix" Ut, Sigma, Vt = sparsesvd(sparse_matrix, self.svd_dimension) print "U Sigma Vt done!" sparse_matrix = array(0) print "Mounting Matrix SVD" self.svd_matrix = numpy.dot(Ut.T, numpy.dot(numpy.diag(Sigma), Vt)) print "Done!" print Ut.T print "\n" print Sigma print "\n" print Vt print "\n" print self.svd_matrix.T print "\n" Ut = None Sigma = None Vt = None
def arun(corpus, dictionary, min_topics=10, max_topics=21, step=5):
    print "Arun running"
    output = []
    for i in range(min_topics, max_topics, step):
        lda = LDA(dictionary, corpus, i, "lda20/lda_training_" + str(i))
        print "Model built/loaded"
        m1 = lda.expElogbeta
        # U, cm1, V = np.linalg.svd(m1)
        smat = scipy.sparse.csc_matrix(m1)  # convert to sparse CSC format
        U, cm1, V = sparsesvd(smat, i + 30)  # do SVD, asking for extra factors
        print "sparsesvd done"
        # Document-topic matrix
        lda_topics = lda[my_corpus]
        m2 = matutils.corpus2dense(lda_topics, lda.num_topics).transpose()
        cm2 = l.dot(m2)
        cm2 = cm2 + 0.0001
        print "cm2norm begin"
        cm2norm = np.linalg.norm(l)
        print "cm2norm end"
        cm2 = cm2 / cm2norm
        print len(cm1), len(cm2)
        kl = sym_kl(cm1, cm2)
        output.append((i, kl))
        print i, kl
    print output
    return output
def learn(mat): print "Starting learning process..." start_time = time.time() user_mat, axis_weights, movie_mat = sparsesvd(mat, NUM_COMPONENTS) print "Matrix decomposition complete (elapsed time: %f s)." % (time.time() - start_time) print "Learning process complete." return (user_mat, axis_weights, movie_mat)
def main():
    #===========================================================================
    # mat = numpy.random.rand(300, 300)
    # smat = scipy.sparse.csc_matrix(mat)
    # ut, s, vt = sparsesvd(smat, 100)
    # tmp = numpy.diag(s)
    # test = numpy.dot(ut.T, numpy.dot(numpy.diag(s), vt))  # vt=(300,300), ut=(300,300), s=(300,1)
    # u2, s2, v2 = svds(mat, k=100)
    #
    # print ""
    #===========================================================================
    #ut, s, vt = sparsesvd(smat, 100)  # do SVD, asking for 100 factors
    # ut - Unitary matrices.
    # s - The singular values for every matrix, sorted in descending order.
    # vt - Unitary matrices
    #assert numpy.allclose(mat, numpy.dot(ut.T, numpy.dot(numpy.diag(s), vt)))
    # test if mat is close to numpy.dot(ut.T, numpy.dot(numpy.diag(s), vt))
    ################################################################################################################
    mat1 = ss.load_npz('/home/ira/Dropbox/IraTechnion/Patterns_Research/sp_sg/mat_ppmi_round_allpats.npz')
    (nrows, ncols) = mat1.get_shape()
    #u1, s1, v1 = svds(mat1, k=500)
    u1, s1, v1 = sparsesvd(csc_matrix(mat1), 500)  # v1 (500, 746K), u1 (500, 746K), s1 (500, 1)
    reduced_mat = numpy.dot(u1.T, numpy.diag(s1))
    ss.save_npz('svd_reduced_mat_500_allpats', csr_matrix(reduced_mat))
    print "I'm here"
def generate_archetypes(singer_resumes, archetype_count_k=20, cache_file=CACHE):
    """ Generate and write to disk an archetype matrix given a population """
    # Generate a unique, ordered, list of characters
    characters = set()  # Could be optimized by using a single comprehension
    for singer_resume in singer_resumes:
        characters.update(singer_resume)
    characters = list(characters)

    # Create a dict to look up character index by id
    character_positions = dict()
    for i, character in enumerate(characters):
        character_positions[character] = i

    # Construct an empty matrix to populate
    dimensions = len(singer_resumes), len(characters)
    singer_matrix = scipy.sparse.lil_matrix(dimensions)

    # Populate the matrix
    for j, singer_resume in enumerate(singer_resumes):
        for character in singer_resume:
            position = character_positions[character]
            singer_matrix[j, position] = True

    # Convert matrix to a sparse CSC matrix
    sparse_singer_matrix = scipy.sparse.csc_matrix(singer_matrix)

    # Do magic with maths
    U, s, V = sparsesvd(sparse_singer_matrix, archetype_count_k)
    archetypes = V

    # Cache the data for later use
    arrays = {CHARACTERS: character_positions, ARCHETYPES: archetypes}
    np.savez(cache_file, **arrays)
def single_line_update_svd(m_old, m_new):
    if m_old.shape != m_new.shape:
        print("\nSingle_line: matrix shapes don't match")
    shape = m_old.shape
    diff = m_new - m_old
    r, c = np.nonzero(diff)
    # print(r, c)
    x = diff[diff != 0]
    # print(x, x.shape)
    if x.shape[0] == shape[0]:
        # update column
        col = c[0]
        a = x.reshape((shape[0], 1))
        b = np.zeros((shape[1], 1))
        b[c, 0] = 1
    else:
        # update row
        row = r[0]
        a = np.zeros((shape[0], 1))
        a[row, 0] = 1
        b = x.reshape((shape[1], 1))
    # print(np.allclose(m_new, m_old + a @ b.T))
    temp = sps.csc_matrix(m_old)
    u, s, v = sparsesvd.sparsesvd(temp, min(shape))
    u = u.T
    v = v.T
    s = np.diag(s)
    u_new, s_new, v_new = increment_svd(u, s, v, a, b)
    e = np.linalg.norm(m_new - (u_new @ s_new @ v_new.T), 2)
    print("Error is ", e)
def learn(mat): print "Starting learning process..." start_time = time.time() user_mat, axis_weights, movie_mat = sparsesvd(mat, NUM_COMPONENTS) print "Matrix decomposition complete (elapsed time: %f s)." % ( time.time() - start_time) print "Learning process complete." return (user_mat, axis_weights, movie_mat)
def test():
    mat = sp.rand(200, 100, density=0.01)  # create a random matrix
    smat = csc_matrix(mat)                  # convert to sparse CSC format
    ut, s, vt = sparsesvd(smat, 1)          # do SVD, asking for 1 factor
    mat_prime = np.dot(ut.T, np.dot(np.diag(s), vt))
    print(len(np.transpose(mat.nonzero())))
    print(len(np.transpose(mat_prime.nonzero())))
def decompose(self):
    singular_vals = 100
    print "decomposing, with %d singular-value requested." % singular_vals
    ut, s, vt = sparsesvd(csc_matrix(self.matrix), singular_vals)
    print "s*vt:", os.linesep, numpy.dot(s, vt)
    print "ut:", os.linesep, ut
    print "s:", os.linesep, s
    print "vt", os.linesep, vt
def svd_reduction(dataArray, k, get="feature-latent"):
    sparseDataArray = csc_matrix(dataArray)
    ut, s, vt = sparsesvd(sparseDataArray, k)
    if get == "feature-latent":
        return np.matmul(dataArray.transpose(), ut.transpose())
    else:
        return np.matmul(dataArray, vt.transpose())
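# --- Usage sketch (not part of the original code) ---
# Shows the two return modes of svd_reduction() above on a made-up dense array.
# Assumes `np` is numpy and csc_matrix/sparsesvd are imported as in the snippet;
# the get="document-latent" string is illustrative (any value other than
# "feature-latent" selects the second branch).
import numpy as np
from scipy.sparse import csc_matrix
from sparsesvd import sparsesvd

if __name__ == '__main__':
    data = np.random.rand(50, 8)
    feature_latent = svd_reduction(data, k=3, get="feature-latent")
    doc_latent = svd_reduction(data, k=3, get="document-latent")
    print(feature_latent.shape, doc_latent.shape)  # (8, 3) and (50, 3)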
def projections(self):
    """Get the set of vectors for all URIs"""
    if self._ut is None:
        self._ut, self._s, self._vt = sparsesvd(self._adjacency, self._rank)
        (self._ut_shape, self._s_shape, self._vt_shape) = (self._ut.shape, self._s.shape, self._vt.shape)
    return self._ut.T
def _compute_svd(self, normalize_data=True):
    self.logger.info('Computing the Singular Value Decomposition of the relation matrix')
    if normalize_data:
        self.data_normalization()
    self.relationship_matrix_csc = self.relationship_matrix.tocsc()
    self.svd_u, self.svd_s, self.svd_v = sparsesvd(self.relationship_matrix_csc, self.dimensionality)
def learnProjection(dataset, pivotsMethod, n):
    """
    Learn the projection matrix and store it to a file.
    """
    h = 50  # no. of SVD dimensions.
    #n = 500  # no. of pivots.

    # Parameters to reduce the number of features in the tail
    # domainTh = {'books':5, 'dvd':5, 'kitchen':5, 'electronics':5}

    # Load pivots.
    pivotsFile = "../work/%s/obj/%s" % (dataset, pivotsMethod)
    features = pi.load_stored_obj(pivotsFile)
    pivots = dict(features[:n]).keys()
    print "selecting top-%d features in %s as pivots" % (len(pivots), pivotsMethod)

    # Load features and get domain specific features
    fname = "../work/%s/obj/freq" % (dataset)
    if "un_" in pivotsMethod:
        fname = "../work/%s/obj/un_freq" % (dataset)
    features = pi.load_stored_obj(fname)
    feats = dict(features)
    # print feats.keys()
    # DSwords = [item for item in feats if item not in pivots]
    feats = feats.keys()

    # Load train vectors.
    print "Loading Training vectors...",
    startTime = time.time()
    vects = []
    vects.extend(loadFeatureVecors("../data/%s/train-sentences" % dataset, feats))
    endTime = time.time()
    print "%ss" % str(round(endTime - startTime, 2))
    print "Total no. of documents =", len(vects)
    print "Total no. of features =", len(feats)

    # Learn pivot predictors.
    print "Learning Pivot Predictors.."
    startTime = time.time()
    M = sp.lil_matrix((len(feats), len(pivots)), dtype=np.float)
    for (j, w) in enumerate(pivots):
        print "%d of %d %s" % (j, len(pivots), w)
        for (feat, val) in getWeightVector(w, vects):
            i = feats.index(feat)
            M[i, j] = val
    endTime = time.time()
    print "Took %ss" % str(round(endTime - startTime, 2))

    # Perform SVD on M
    print "Perform SVD on the weight matrix...",
    startTime = time.time()
    ut, s, vt = sparsesvd(M.tocsc(), h)
    endTime = time.time()
    print "%ss" % str(round(endTime - startTime, 2))
    sio.savemat("../work/%s/proj_scl.mat" % (dataset), {'proj': ut.T})
    pass
def testBeyondAccurracyMetrics(train_filename, eval_item_filename, user_means_filename):
    logging.info('testing beyond-accuracy topNLists with data files {0}; {1}; {2}...'.format(
        train_filename, eval_item_filename, user_means_filename))
    train_data = trainData.TrainData(train_filename, user_means_filename)
    _, _, Q = sparsesvd(train_data.rating_matrix.tocsc(), config.FACTOR_MODEL_SIZE)

    with open(eval_item_filename, 'rb') as eval_file:
        for line in eval_file:
            data = line.split('\t')
            user_id = data[0]
            user_index = train_data.getUserIndex(user_id)
            if len(train_data.getUserProfileByIndex(user_index)) < 1:
                continue

            ground_truth_items = data[1].split(',')
            random_unrated_items = data[2].rstrip('\n').split(',')
            evaluation_item_ids = ground_truth_items + random_unrated_items
            rec_list_szie = config.RECOMMENDATION_LIST_SIZE * config.DIVERSIFICATION_CANDIDATES_FACTOR

            # predictions = train_data.getFactorBasedRecommendations(user_id, Q, evaluation_item_ids)
            # top_recs = topNLists.getTopNList(predictions, rec_list_szie)

            # predictions_ib = train_data.getItemBasedRecommendations(user_id, evaluation_item_ids, 'non_normalized')
            # top_recs_ib = topNLists.getTopNList(predictions_ib, rec_list_szie)

            # predictions = library_recommender.recommend_items(mrec_train_data.X, int(user_id)-config.MREC_INDEX_OFFSET, max_items=10000, return_scores=True)
            # top_recs = topNLists.getTopNList(predictions, rec_list_szie, evaluation_item_ids)

            predictions_ub = train_data.getUserBasedRecommendations(user_id, evaluation_item_ids, 'non_normalized')
            top_recs_ub = topNLists.getTopNList(predictions_ub, rec_list_szie)

            # print 'user', user_id
            # print top_recs_ib, top_recs_ub
            # rare = train_data.getPopularityInfo()[:10]
            # pop = train_data.getPopularityInfo()[-10:]

            top_recs = top_recs_ub
            print 'diversity_ratings', diversity.getListDiversity(train_data, top_recs, 'div_r')
            print 'diversity_content', diversity.getListDiversity(train_data, top_recs, 'div_c')
            print 'content', serendipity.getListSerendipity(train_data, user_index, top_recs, 'sur_c')
            # print 'rare cooccurrence', serendipity.getListSerendipity(train_data, user_index, rare, 'sur_r')
            # print 'rare cooccurrence normalized', serendipity.getListSerendipity(train_data, user_index, rare, 'sur_r_n')
            # print 'pop cooccurrence', serendipity.getListSerendipity(train_data, user_index, pop, 'sur_r')
            # print 'pop cooccurrence normalized', serendipity.getListSerendipity(train_data, user_index, pop, 'sur_r_n')
            # print 'rare novelty', novelty.getListNovelty(train_data, rare)
            # print 'pop novelty', novelty.getListNovelty(train_data, pop)
            print '------------------------------'
def main(size, thr, ns, sppmi, f_in, f_out):
    size = int(size)
    thr = int(thr)
    ns = float(ns)
    sppmi = int(sppmi)  # 1: sppmi, 0: raw
    print "Input text file: ", f_in
    print "Building dict..."
    vocab_count, text_num = build_dict(f_in, thr)
    alltokens = vocab_count.keys()
    vocab_id = dict((t, i) for i, t in enumerate(alltokens))
    vocab_size = len(vocab_id)
    print "the number of texts: ", text_num
    print "vocabulary size: ", vocab_size
    train_num = sum(vocab_count.values())
    print "The number of tokens: ", train_num

    i = 0
    row = []
    col = []
    data = []
    for l in open(f_in).readlines():
        tokens = l.split()
        indexes = list(np.zeros(vocab_size))
        for t in tokens:
            try:
                if sppmi == 1:
                    indexes[vocab_id[t]] += train_num / ns / (len(tokens) * vocab_count[t])
                else:
                    indexes[vocab_id[t]] += 1
            except KeyError:
                pass
        if sppmi == 1:
            for j in range(len(indexes)):
                if indexes[j] > 1:  # only positive values are retained
                    row.append(i)
                    col.append(j)
                    data.append(np.log(indexes[j]))
        else:
            for j in range(len(indexes)):
                if indexes[j] > 0:
                    row.append(i)
                    col.append(j)
                    data.append(indexes[j])
        i += 1

    print "the size of the co-occurrence matrix of term-document type doc_num, vocab_size:", text_num, vocab_size
    s_co_mat = scipy.sparse.csc_matrix(
        (np.array(data), (np.array(row), np.array(col))), shape=(text_num, vocab_size))
    ut, s, vt = sparsesvd(s_co_mat, size)
    ut = ut.transpose()
    f = open(f_out, "w")
    for i in range(ut.shape[0]):
        f.write(str(i) + " ")
        for j in range(ut.shape[1]):
            f.write(str(ut[i, j]) + " ")
        f.write("\n")
def __factorize_rating_matrix(self):
    mat = sparse.lil_matrix((self.m, self.n))
    for user in self.user_ratings.iterkeys():
        for item in self.user_ratings[user]:
            mat[self.user_positions[user], self.item_positions[item]] = self.user_ratings[user][item]
    u, s, q = sparsesvd(sparse.csc_matrix(mat), self.num_facs)
    return u.T, numpy.diag(s), q.T
def sparseSVD(D):
    import scipy.sparse
    try:
        import sparsesvd
    except ImportError:
        print 'bummer ... better get sparsesvd'
        exit(0)
    Ds = scipy.sparse.csc_matrix(D)
    a = sparsesvd.sparsesvd(Ds, Ds.shape[0])
    return a
def run(self):
    denseMat = loadMatFile(self.rawMatFile)
    sparseMat = scipy.sparse.csc_matrix(denseMat)
    if int(len(denseMat)) >= self.cutOffSVD:
        ut, s, vt = sparsesvd.sparsesvd(sparseMat, self.cutOffSVD)
        dump2File(vt, self.VtFile)
        reducedMatrix = computeReduction(vt, denseMat, self.cutOffSVD)
        dump2File(reducedMatrix, self.redMatFile)
    else:
        print('--> not enough contexts')
def transform_data(X, X_test):
    X_all = np.vstack((X, X_test))
    tfidf = feature_extraction.text.TfidfTransformer()
    X_all = tfidf.fit_transform(X_all).toarray()
    X_all_sparse = scipy.sparse.csc_matrix(X_all)
    U, s, V = sparsesvd(X_all_sparse, 60)
    print U.shape, s.shape, V.shape
    S = np.diag(s)
    X_all = np.dot(np.transpose(U), np.dot(S, V))
    return X_all[0:X.shape[0], :], X_all[X.shape[0]:, :]
def __init__(self, corpus):
    term_doc = TermDoc()
    for document in corpus:
        # Tokenize/stem each document
        tokens = process_document(document)
        # Add it to the sparse term-document matrix
        term_doc.add_document(document, tokens)
    print len(term_doc._words), len(term_doc._documents)
    # Calculate the SVD
    self.T, self.s, self.D = sparsesvd(term_doc._matrix.as_csc(), 300)
def testLearnModel2(self):
    X = scipy.sparse.rand(10, 10, 0.2)
    X = X.tocsc()

    lmbdas = numpy.array([10.0, 0.0])
    eps = 0.01
    k = 9

    # Check out singular values
    U, s, V = sparsesvd(X.tocsc(), k)

    softImpute = SoftImpute(lmbdas, eps, k)
    ZList = softImpute.learnModel2(X)

    # Test that when lambda=0 we get approx the original matrix back
    X2 = ZList[1].todense()
    nptst.assert_almost_equal(X.todense(), X2)

    # When lambda is greater or equal to the largest singular value, we get 0
    U, s, V = sparsesvd(X.tocsc(), k)
    lmbdas = numpy.array([numpy.max(s)])
    softImpute = SoftImpute(lmbdas, eps, k)
    Z = softImpute.learnModel2(X)
    self.assertEquals(numpy.linalg.norm(Z.todense()), 0)

    # Check solution for medium values of lambda
    eps = 0.1
    lmbdas = numpy.array([0.1, 0.2, 0.5, 1.0])
    softImpute = SoftImpute(lmbdas, eps, k)
    ZList = softImpute.learnModel2(X)

    for j, Z in enumerate(ZList):
        Z = Z.todense()
        Zomega = numpy.zeros(X.shape)
        rowInds, colInds = X.nonzero()
        for i in range(X.nonzero()[0].shape[0]):
            Zomega[rowInds[i], colInds[i]] = Z[rowInds[i], colInds[i]]
        U, s, V = ExpSU.SparseUtils.svdSoft(numpy.array(X - Zomega + Z), lmbdas[j])
        tol = 0.1
        self.assertTrue(numpy.linalg.norm(Z - (U * s).dot(V.T)) ** 2 < tol)
def process_SVD1(inputFileName, outputFileName, n, p):
    """
    Perform SVD1.
    """
    mat, rowids = loadMatrix(inputFileName)
    X = mat.tocsc()
    ut, s, vt = sparsesvd(X, n)
    A = np.dot(ut.T, np.diag(s ** p))
    saveMatrix(A, rowids, outputFileName)
    mmwrite("%s.ut" % inputFileName, ut)
    np.savetxt("%s.s" % inputFileName, s)
    mmwrite("%s.vt" % inputFileName, vt)
    pass
def psp_pseudoinverse(Mat, precision):
    list_nz = (Mat.sum(axis=1) == 1)
    list_mat = []
    for i in range(list_nz.shape[0]):
        if list_nz[i]:
            list_mat.append(i)
    temp_Mat = Mat[list_mat, :]
    matrix = spmatrix.ll_mat(temp_Mat.shape[0], temp_Mat.shape[1])
    matrix.update_add_at(temp_Mat.tocoo().data, temp_Mat.tocoo().row, temp_Mat.tocoo().col)
    if matrix.shape[0] <= matrix.shape[1]:
        k = int((precision * matrix.shape[0]) / 100)
        ut, s, vt = sparsesvd(matrix.tocsc(), k)
        UT = ss.csr_matrix(ut)
        SI = ss.csr_matrix(np.diag(1 / s))
        VT = ss.csr_matrix(vt)
        temp_matrix = spmatrixmul(VT.transpose(), SI)
        pinv_matrix = spmatrixmul(temp_matrix, UT)
        del ut, s, vt, UT, SI, VT, temp_matrix
    else:
        k = int((precision * matrix.transpose().shape[0]) / 100)
        ut, s, vt = sparsesvd(matrix.transpose().tocsc(), k)
        UT = ss.csr_matrix(ut)
        SI = ss.csr_matrix(np.diag(1 / s))
        VT = ss.csr_matrix(vt)
        temp_matrix = spmatrixmul(UT.transpose(), SI)
        pinv_matrix = spmatrixmul(temp_matrix, VT)
        del ut, s, vt, UT, SI, VT, temp_matrix
    return pinv_matrix.tocsr()
def debug():
    """
    Test the various functions implemented in this module.
    """
    mat, rowids = loadMatrix("../work/testMatrix")
    #convertPPMI(mat)
    #saveMatrix(mat, rowids, "../work/pmiMatrix")
    X = mat.tocsc()
    ut, s, vt = sparsesvd(X, 50)
    #print allclose(X, np.dot(ut.T, np.dot(np.diag(s), vt)))
    A = np.dot(ut.T, np.diag(s))
    saveMatrix(A, rowids, "../work/featMatrix")
    pass
def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS,
             extra_dims=P2_EXTRA_DIMS, dtype=np.float64):
    """Construct the (U, S) projection from a corpus.

    Parameters
    ----------
    m : int
        Number of features (terms) in the corpus.
    k : int
        Desired rank of the decomposed matrix.
    docs : {iterable of list of (int, float), scipy.sparse.csc}
        Corpus in BoW format or as sparse matrix.
    use_svdlibc : bool, optional
        If True - will use `sparsesvd library <https://pypi.python.org/pypi/sparsesvd/>`_,
        otherwise - our own version will be used.
    power_iters: int, optional
        Number of power iteration steps to be used. Tune to improve accuracy.
    extra_dims : int, optional
        Extra samples to be used besides the rank `k`. Tune to improve accuracy.
    dtype : numpy.dtype, optional
        Enforces a type for elements of the decomposed matrix.

    """
    self.m, self.k = m, k
    self.power_iters = power_iters
    self.extra_dims = extra_dims
    if docs is not None:
        # base case decomposition: given a job `docs`, compute its decomposition,
        # *in-core*.
        if not use_svdlibc:
            u, s = stochastic_svd(
                docs, k, chunksize=sys.maxsize, num_terms=m,
                power_iters=self.power_iters, extra_dims=self.extra_dims, dtype=dtype)
        else:
            try:
                import sparsesvd
            except ImportError:
                raise ImportError("`sparsesvd` module requested but not found; run `easy_install sparsesvd`")
            logger.info("computing sparse SVD of %s matrix", str(docs.shape))
            if not scipy.sparse.issparse(docs):
                docs = matutils.corpus2csc(docs)
            # ask for extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
            ut, s, vt = sparsesvd.sparsesvd(docs, k + 30)
            u = ut.T
            del ut, vt
        k = clip_spectrum(s ** 2, self.k)
        self.u = u[:, :k].copy()
        self.s = s[:k].copy()
    else:
        self.u, self.s = None, None
def setupencodersandimages(listoffiles, scalew, scaleh, numgabor):
    combined = []
    for file_ in listoffiles:
        combined.append(converttoarray(file_, scalew, scaleh))
    imgs = numpy.array([img / numpy.linalg.norm(img) for img in combined]).T
    csc = csc_matrix(imgs)
    ut, S, vt = sparsesvd(csc, len(listoffiles))
    M = numpy.diag([numpy.linalg.norm(ut[i:]) for i in range(ut.shape[0])])
    # W is M inverse
    UW = numpy.dot(ut.T, numpy.linalg.inv(M))
    MSvt = numpy.dot(M, numpy.dot(numpy.diag(S), vt))
    gaborenc = [makerandomgabor(scalew * scaleh) for i in range(numgabor)]
    gaborenc = [(1 / numpy.linalg.norm(i).flatten()) * i for i in gaborenc]
    return imgs, UW, MSvt, gaborenc
def convertor2matrix(self, rank):
    self.loadDataFileMovieBased(moiveRatingTrainsetFilename)
    self.loadDataFile(moiveRatingTrainsetFilename)
    self.matrix = [[0 for x in range(0, len(self.allMovieRatingRecord))]
                   for y in range(0, len(self.allUserRatingRecord))]
    self.users = list(self.allUserRatingRecord.keys())
    self.movies = list(self.allMovieRatingRecord)
    matrix = self.matrix
    for u in range(len(self.users)):
        user = self.users[u]
        userEntry = self.allUserRatingRecord[user]
        for movie in userEntry:
            matrix[u][self.movies.index(movie)] = int(userEntry[movie][0])
    avg = 0.0
    userAvg = []
    for i in range(len(matrix)):
        c = 0.0
        s = 0.0
        for j in range(len(matrix[i])):
            if matrix[i][j] != 0:
                c += 1
                s += matrix[i][j]
        userAvg.append(s / c)
    movieAvg = []
    for i in range(len(matrix[0])):
        c = 0.0
        s = 0.0
        for j in range(len(matrix)):
            if matrix[j][i] != 0:
                c += 1
                s += matrix[j][i]
        if c == 0.0:
            print self.movies[i]
        movieAvg.append(s / c)
    for i in range(len(matrix)):
        for j in range(len(matrix[i])):
            if matrix[i][j] != 0:
                matrix[i][j] -= userAvg[i]
    smat = scipy.sparse.csc_matrix(matrix)
    u, s, v = sparsesvd(smat, rank)
    u = u.transpose()
    s = diag(s)
    res = dot(dot(u, s), v)
    for i in range(len(res)):
        for j in range(len(res[i])):
            res[i][j] += userAvg[i]
    self.result = res
def run(self):
    self.context = loadContextFile(self.contextFile)
    self.lexicon = buildLexicon(self.context, self.stopList, self.lexiconCutoff)
    self.lexiconSorted = getSortedLexicon(self.lexicon)
    self.lexiconPosition = buildLexiconPositionTable(self.lexiconSorted)
    writeLexicon(self.lexiconFile, self.lexicon, self.lexiconSorted)
    self.denseMat = computeMatrix(int(self.winLength), self.lexicon, self.lexiconPosition, self.context)
    if int(len(self.denseMat)) >= self.cutOffSVD:
        self.sparseMat = scipy.sparse.csc_matrix(self.denseMat)
        ut, s, self.vt = sparsesvd.sparsesvd(self.sparseMat, self.cutOffSVD)
        dump2File(self.vt, self.VtFile)
        self.reducedMatrix = computeReduction(self.vt, self.denseMat, self.cutOffSVD)
        dump2File(self.reducedMatrix, self.redMatFile)
    else:
        print('--> not enough contexts')