def findAnchorTopics_Orig(Data, K=10, loss='L2', seed=0, lowerDim=1000,
                          minDocPerWord=0, eps=1e-4, doRecover=1):
    """ Estimate and return K topics using anchor word method

    Parameters
    ----------
    Data : bnpy.data.DataObj
        Dataset providing a sparse doc-by-word count matrix (must be CSR).
    K : int
        Number of topics / anchor words to select.
    loss : str
        Loss passed to the recovery step (default 'L2').
    seed, lowerDim, minDocPerWord, eps
        Forwarded to Params; control random projection size and tolerances.
    doRecover : int or bool
        If truthy, run the recovery step and return normalized topics.
        If falsy, skip recovery and return the pair (Q, anchors) instead.

    Returns
    -------
    topics : numpy 2D array, size K x V
        Each row normalized to sum to one.
        (When doRecover is falsy, returns (Q, anchors) instead.)
    """
    from scipy.sparse import isspmatrix_csr
    from Q_matrix import generate_Q_matrix
    from fastRecover import do_recovery

    params = Params(seed=seed, lowerDim=lowerDim,
                    minDocPerWord=minDocPerWord, eps=eps)
    assert isinstance(Data, bnpy.data.DataObj)
    DocWordMat = Data.getSparseDocTypeCountMatrix()
    # Proper sparse-format check; the old code string-matched on
    # str(type(...)), which breaks across scipy versions/subclasses.
    if not isspmatrix_csr(DocWordMat):
        raise NotImplementedError('Need CSR matrix')
    # Q is built from the transposed (word-by-doc) counts; see Q_matrix module.
    Q = generate_Q_matrix(DocWordMat.copy().T)
    anchors = selectAnchorWords(DocWordMat.tocsc(), Q, K, params)
    if doRecover:
        topics, topic_likelihoods = do_recovery(Q, anchors, loss, params)
        topics = topics.T
        # Normalize each topic's word distribution to sum to one.
        topics = topics / topics.sum(axis=1)[:, np.newaxis]
        return topics
    else:
        return Q, anchors
# Anchor-word pipeline (Python 2 script fragment; uses print statements,
# xrange, and the py2 builtin file()). settings_file, vocab_file, infile,
# and K are defined earlier in the file -- not visible in this chunk.
params = Params(settings_file)
params.dictionary_file = vocab_file
# Load the doc-word matrix M from a MATLAB .mat file under key 'M'.
# NOTE(review): the anchor loop below indexes M[i, :] and reads the column
# indices of the nonzeros, so rows are presumably words and columns
# documents -- confirm against the writer of the .mat file.
M = scipy.io.loadmat(infile)['M']
print "identifying candidate anchors"
candidate_anchors = []
#only accept anchors that appear in a significant number of docs
for i in xrange(M.shape[0]):
    # np.nonzero(...)[1] = column indices, i.e. the docs containing word i.
    if len(np.nonzero(M[i, :])[1]) > params.anchor_thresh:
        candidate_anchors.append(i)
print len(candidate_anchors), "candidates"
#forms Q matrix from document-word matrix
Q = generate_Q_matrix(M)
# Vocabulary: one token per whitespace-separated entry in vocab_file.
vocab = file(vocab_file).read().strip().split()
#check that Q sum is 1 or close to it
print "Q sum is", Q.sum()
V = Q.shape[0]
print "done reading documents"
#find anchors- this step uses a random projection
#into low dimensional space
anchors = findAnchors(Q, K, params, candidate_anchors)
print "anchors are:"
for i, a in enumerate(anchors):
    print i, vocab[a]
def anchor_words(D, loss='L2', params=None):
    """Recover the topic matrix W from a doc-word matrix D via anchor words.

    Parameters
    ----------
    D : document-word matrix; scaled by 100 before Q construction.
    loss : str
        Loss passed to the recovery step (default 'L2').
    params : config mapping with at least key 'T' (number of topics).
        Defaults to a fresh config.default_config().

    Returns
    -------
    W : recovered topic matrix from do_recovery.

    Notes
    -----
    The previous signature used ``params=config.default_config()``, which
    evaluated the default ONCE at function-definition time, so every call
    relying on the default shared (and could mutate) a single config object.
    The None-sentinel below builds a fresh default per call; callers that
    pass params explicitly are unaffected.
    """
    if params is None:
        params = config.default_config()
    # Scale counts before building the co-occurrence matrix Q.
    Q = generate_Q_matrix(D * 100)
    anchors = findAnchors(Q, params['T'], params)
    W, topic_likelihoods = do_recovery(Q, anchors, loss, params)
    return W
def run(self):
    """Run the full anchor-word topic pipeline.

    Loads the doc-word matrix and vocabulary (from files or directly from
    self.params), finds candidate anchor words, builds Q, selects anchors,
    recovers topics, prints the top words per topic (to stdout and, if
    params.outfile is set, also to <outfile>.topwords), and stores results
    as attributes on self (Q, M, A, topic_likelihoods, anchors, vocab, ...).
    """
    params = self.params
    # Accept either a path to a .mat file (key 'M') or an in-memory matrix.
    # NOTE(review): `basestr` is not a builtin in py2 or py3 -- presumably a
    # compatibility alias defined elsewhere in this file; verify.
    if isinstance(params.infile, basestr):
        M = scipy.io.loadmat(params.infile)['M']
    else:
        M = params.infile
    assert sparse.isspmatrix_csc(M), "Must provide a sparse CSC matrix"
    print("Input matrix shape: {}".format(M.shape))
    # Accept either a path to a whitespace-separated vocab file or an
    # already-built iterable of words.
    if isinstance(params.vocab_file, basestr):
        with open(params.vocab_file) as f:
            vocab = f.read().strip().split()
    else:
        vocab = params.vocab_file
    assert np.iterable(vocab), "Must provide an iterable vocab"
    assert M.shape[0] == len(vocab), \
        "Number of rows must correspond to vocab size: {} rows vs {} vocab words" \
        .format(M.shape[0], len(vocab))
    #only accept anchors that appear in a significant number of docs
    print("identifying candidate anchors")
    candidate_anchors = []
    for i in range(M.shape[0]):
        # np.nonzero(...)[1] = column indices = docs containing word i.
        if len(np.nonzero(M[i, :])[1]) > params.anchor_thresh:
            candidate_anchors.append(i)
    print(len(candidate_anchors), "candidates")
    #forms Q matrix from document-word matrix
    Q = generate_Q_matrix(M)
    # Save copy of unnormalized Q, before any normalizations happen
    self.Q_unnormalized = Q.copy()
    #check that Q sum is 1 or close to it
    print("Q sum is", Q.sum())
    V = Q.shape[0]
    print("done reading documents")
    #find anchors- this step uses a random projection
    #into low dimensional space
    anchors = findAnchors(Q, params, candidate_anchors)
    print("anchors are:")
    for i, a in enumerate(anchors):
        print(i, vocab[a])
    #recover topics
    A, topic_likelihoods = do_recovery(Q, anchors, params)
    print("done recovering")
    # Fan out the "top words" report to stdout and, optionally, a file.
    output_streams = [sys.stdout]
    output_file_handle = None
    if params.outfile is not None:
        np.savetxt(params.outfile+".A", A)
        np.savetxt(params.outfile+".topic_likelihoods", topic_likelihoods)
        output_file_handle = open(params.outfile+".topwords", 'w')
        output_streams.append(output_file_handle)
    def print_multiple(*args, **kwargs):
        # Print the same info to multiple output streams
        for f in output_streams:
            print(*args, file=f, **kwargs)
    # Display top words per topic
    all_topwords = []
    for k in range(params.K):
        # Indices of the top_words largest entries of topic k, descending.
        topwords = np.argsort(A[:, k])[-params.top_words:][::-1]
        print_multiple(vocab[anchors[k]], ':', end=' ')
        for w in topwords:
            print_multiple(vocab[w], end=' ')
        print_multiple("")
        all_topwords.append(TopWordsSummary(
            topic_index = k,
            anchor_word_index = anchors[k],
            anchor_word = vocab[anchors[k]],
            top_word_indices = topwords,
            top_words = [vocab[w] for w in topwords]))
    if params.outfile is not None:
        output_file_handle.close()
    # make some results available as attributes of "self"
    self.Q = Q
    self.M = M
    self.A = A
    self._R = None
    self.topic_likelihoods = topic_likelihoods
    self.candidate_anchors = candidate_anchors
    self.anchors = anchors
    self.vocab = vocab
    self.all_topwords = all_topwords
#only accept anchors that appear in a significant number of docs for i in xrange(M.shape[0]): if len(np.nonzero(M[i, :])[1]) > params.anchor_thresh: candidate_anchors.append(i) print len(candidate_anchors), "candidates" if len(candidate_anchors) < K: print "*** ERROR: there are only", len( candidate_anchors), "candidate anchors and K=", K print "*** Currently only anchors that appear in more than", params.anchor_thresh, "documents are considered as candidates for anchors" print "*** You can change this in the settings file or try to learn a model with fewer anchors" sys.exit() #forms Q matrix from document-word matrix Q = generate_Q_matrix(M) vocab = file(vocab_file).read().strip().split() #check that Q sum is 1 or close to it print "Q sum is", Q.sum() V = Q.shape[0] print "done reading documents" #find anchors- this step uses a random projection #into low dimensional space anchor_logfile = file(params.log_prefix + '.anchors', 'w') anchors = findAnchors(Q, K, params, candidate_anchors, anchor_logfile) print "anchors are:" print >> anchor_logfile, "anchors are:"
# Python 2 script fragment using a random projection (rp.Random_Matrix) and
# row-by-row Q generation. vocab_file, M, params, row_M, col_M, K, and
# outfile are defined earlier in the file -- not visible in this chunk.
vocab = file(vocab_file).read().strip().split()
V = M.shape[0]
prng = RandomState(params.seed)
# Random projection matrix of size V x new_dim, seeded for reproducibility.
R = rp.Random_Matrix(V, params.new_dim, prng)
#only accept anchors that appear in a significant number of docs
print "identifying candidate anchors"
candidate_anchors = []
for i in xrange(V):
    # np.nonzero(...)[1] = column indices, i.e. the docs containing word i.
    if len(np.nonzero(row_M[i, :])[1]) > params.anchor_thresh:
        candidate_anchors.append(i)
print len(candidate_anchors), "candidates"
Q = np.vstack(
    generate_Q_matrix(row_M, col_M, row_normalize=True,
                      projection_matrix=R.T))  #row-by-row generation
# Greedy anchor selection on the projected rows of Q.
_, anchors = gs.Projection_Find(Q, K, candidate_anchors)
print "anchors are:", anchors
# Write a TSV of (topic id, word id, word) for the chosen anchors.
anchor_file = file(outfile + '.anchors', 'w')
print >> anchor_file, "\t".join(["topic id", "word id", "word"])
for i, a in enumerate(anchors):
    print i, vocab[a]
    print >> anchor_file, "\t".join([str(x) for x in (i, a, vocab[a])])
anchor_file.close()
#recover topics
row_sums = np.array(row_M.sum(1)).reshape(V)
#generate Q_matrix rows for anchors
# NOTE(review): this chunk is truncated here -- the np.vstack(...) call
# continues beyond the visible source; do not edit past this point.
Q_A = np.vstack(
# Python 2 script fragment (near-duplicate of the previous chunk) using a
# random projection and row-by-row Q generation. vocab_file, M, params,
# row_M, col_M, K, and outfile are defined earlier in the file -- not
# visible in this chunk.
vocab = file(vocab_file).read().strip().split()
V = M.shape[0]
prng = RandomState(params.seed)
# Random projection matrix of size V x new_dim, seeded for reproducibility.
R = rp.Random_Matrix(V, params.new_dim, prng)
#only accept anchors that appear in a significant number of docs
print "identifying candidate anchors"
candidate_anchors = []
for i in xrange(V):
    # np.nonzero(...)[1] = column indices, i.e. the docs containing word i.
    if len(np.nonzero(row_M[i, :])[1]) > params.anchor_thresh:
        candidate_anchors.append(i)
print len(candidate_anchors), "candidates"
Q = np.vstack(generate_Q_matrix(row_M, col_M, row_normalize=True,
                                projection_matrix=R.T))  #row-by-row generation
# Greedy anchor selection on the projected rows of Q.
_, anchors = gs.Projection_Find(Q, K, candidate_anchors)
print "anchors are:", anchors
# Write a TSV of (topic id, word id, word) for the chosen anchors.
anchor_file = file(outfile+'.anchors', 'w')
print >>anchor_file, "\t".join(["topic id", "word id", "word"])
for i, a in enumerate(anchors):
    print i, vocab[a]
    print >>anchor_file, "\t".join([str(x) for x in (i,a,vocab[a])])
anchor_file.close()
#recover topics
# Per-word document counts, flattened to a length-V vector.
row_sums = np.array(row_M.sum(1)).reshape(V)
#generate Q_matrix rows for anchors
# Rebuild only the anchor rows of Q, unprojected (projection_matrix=None),
# for the recovery step.
Q_A = np.vstack(generate_Q_matrix(row_M, col_M, row_normalize=True,
                                  indices=anchors, projection_matrix=None))