Example #1
def selectAnchorWords(DocWordMat, Q, K, params):
    import numpy as np
    from scipy import sparse
    from anchors import findAnchors

    if not sparse.isspmatrix_csc(DocWordMat):
        raise NotImplementedError('Need CSC matrix')

    # Number of documents each word appears in (CSC columns are words)
    nDocsPerWord = np.diff(DocWordMat.indptr)
    # Only words that appear in enough documents may serve as anchors
    candidateWords = np.flatnonzero(nDocsPerWord > params.minDocPerWord)

    anchors = findAnchors(Q, K, params, candidateWords.tolist())
    return anchors
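A minimal, hypothetical driver for this helper, assuming a SciPy CSC documents-by-words count matrix, a co-occurrence matrix Q built by the project's generate_Q_matrix, and a params object exposing minDocPerWord; the import path and parameter fields are assumptions, and findAnchors may need more fields than shown:

import numpy as np
from scipy import sparse
from types import SimpleNamespace

from Q_matrix import generate_Q_matrix      # assumed module layout

K = 20                                      # number of topics to recover
params = SimpleNamespace(minDocPerWord=10)  # assumed minimal parameter object

# toy documents-by-words count matrix in CSC format (columns are words)
DocWordMat = sparse.random(500, 3000, density=0.01, format='csc', random_state=0)

Q = generate_Q_matrix(DocWordMat.T.tocsc())  # word-word co-occurrence matrix
anchors = selectAnchorWords(DocWordMat, Q, K, params)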
print len(candidate_anchors), "candidates"

#forms Q matrix from document-word matrix
Q = generate_Q_matrix(M)

vocab = file(vocab_file).read().strip().split()

#check that Q sum is 1 or close to it
print "Q sum is", Q.sum()
V = Q.shape[0]
print "done reading documents"

#find anchors- this step uses a random projection
#into low dimensional space
anchors = findAnchors(Q, K, params, candidate_anchors)
print "anchors are:"
for i, a in enumerate(anchors):
    print i, vocab[a]

#recover topics
A, topic_likelihoods = do_recovery(Q, anchors, loss, params) 
print "done recovering"

np.savetxt(outfile+".A", A)
np.savetxt(outfile+".topic_likelihoods", topic_likelihoods)

#display
f = file(outfile+".topwords", 'w')
for k in xrange(K):
    topwords = np.argsort(A[:, k])[-params.top_words:][::-1]
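    # The snippet is cut off here. A hedged sketch of how the display loop
    # typically finishes, mirroring the fuller run() method further down
    # this page: write the topic's anchor word, then its top words.
    f.write(vocab[anchors[k]] + ': ')
    f.write(' '.join(vocab[w] for w in topwords) + '\n')
f.close()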
Example #3
File: prepare.py  Project: yangkuoone/nmf
def anchor_words(D, loss='L2', params=config.default_config()):
    Q = generate_Q_matrix(D * 100)
    anchors = findAnchors(Q, params['T'], params)
    W, topic_likelihoods = do_recovery(Q, anchors, loss, params)
    return W
    def run(self):
        params = self.params

        if isinstance(params.infile, str):
            M = scipy.io.loadmat(params.infile)['M']
        else:
            M = params.infile
        assert sparse.isspmatrix_csc(M), "Must provide a sparse CSC matrix"

        print("Input matrix shape: {}".format(M.shape))

        if isinstance(params.vocab_file, str):
            with open(params.vocab_file) as f:
                vocab = f.read().strip().split()
        else:
            vocab = params.vocab_file
        assert np.iterable(vocab), "Must provide an iterable vocab"

        assert M.shape[0] == len(vocab), \
            "Number of rows must correspond to vocab size: {} rows vs {} vocab words" \
            .format(M.shape[0], len(vocab))

        #only accept anchors that appear in a significant number of docs
        print("identifying candidate anchors")
        candidate_anchors = []
        for i in range(M.shape[0]):
            if len(np.nonzero(M[i, :])[1]) > params.anchor_thresh:
                candidate_anchors.append(i)

        print(len(candidate_anchors), "candidates")

        #forms Q matrix from document-word matrix
        Q = generate_Q_matrix(M)

        # Save copy of unnormalized Q, before any normalizations happen
        self.Q_unnormalized = Q.copy()

        #check that Q sum is 1 or close to it
        print("Q sum is", Q.sum())
        V = Q.shape[0]
        print("done reading documents")

        #find anchors- this step uses a random projection
        #into low dimensional space
        anchors = findAnchors(Q, params, candidate_anchors)
        print("anchors are:")
        for i, a in enumerate(anchors):
            print(i, vocab[a])

        #recover topics
        A, topic_likelihoods = do_recovery(Q, anchors, params)
        print("done recovering")

        output_streams = [sys.stdout]
        output_file_handle = None
        if params.outfile is not None:
            np.savetxt(params.outfile+".A", A)
            np.savetxt(params.outfile+".topic_likelihoods", topic_likelihoods)
            output_file_handle = open(params.outfile+".topwords", 'w')
            output_streams.append(output_file_handle)

        def print_multiple(*args, **kwargs):
            # Print the same info to multiple output streams
            for f in output_streams:
                print(*args, file=f, **kwargs)

        # Display top words per topic
        all_topwords = []
        for k in range(params.K):
            topwords = np.argsort(A[:, k])[-params.top_words:][::-1]
            print_multiple(vocab[anchors[k]], ':', end=' ')
            for w in topwords:
                print_multiple(vocab[w], end=' ')
            print_multiple("")
            all_topwords.append(TopWordsSummary(
                topic_index = k,
                anchor_word_index = anchors[k],
                anchor_word = vocab[anchors[k]],
                top_word_indices = topwords,
                top_words = [vocab[w] for w in topwords]))

        if params.outfile is not None:
            output_file_handle.close()

        # make some results available as attributes of "self"
        self.Q = Q
        self.M = M
        self.A = A
        self._R = None
        self.topic_likelihoods = topic_likelihoods
        self.candidate_anchors = candidate_anchors
        self.anchors = anchors
        self.vocab = vocab
        self.all_topwords = all_topwords
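For orientation, a hypothetical way to drive the anchor_words helper at the top of this example; the orientation of D (vocabulary words in rows) and the presence of a topic-count entry 'T' in config.default_config() are inferred from the surrounding code rather than documented:

from scipy import sparse

from prepare import anchor_words        # assumed import path

# toy words-by-documents frequency matrix (rows correspond to vocabulary words)
D = sparse.random(2000, 300, density=0.01, format='csc', random_state=0)

W = anchor_words(D)                      # topic matrix recovered via anchor words
print(W.shape)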
#forms Q matrix from document-word matrix
Q = generate_Q_matrix(M)

vocab = file(vocab_file).read().strip().split()

#check that Q sum is 1 or close to it
print "Q sum is", Q.sum()
V = Q.shape[0]
print "done reading documents"

#find anchors- this step uses a random projection
#into low dimensional space

anchor_logfile = file(params.log_prefix + '.anchors', 'w')
anchors = findAnchors(Q, K, params, candidate_anchors, anchor_logfile)
print "anchors are:"
print >> anchor_logfile, "anchors are:"
for i, a in enumerate(anchors):
    print i, vocab[a]
    print >> anchor_logfile, i, vocab[a]
anchor_logfile.close()

#recover topics
A, topic_likelihoods, objective = do_recovery(Q, anchors, loss, params)
print "done recovering"
print "avg objective function during recovery using", K, "topics:", objective

np.savetxt(outfile + ".A", A)
np.savetxt(outfile + ".topic_likelihoods", topic_likelihoods)
Example #7
print(len(candidate_anchors), "candidates")

# forms Q matrix from document-word matrix
Q = generate_Q_matrix(M)

vocab = open(vocab_file).read().strip().split()

# check that Q sum is 1 or close to it
print("Q sum is", Q.sum())
V = Q.shape[0]
print("done reading documents")

# find anchors- this step uses a random projection
# into low dimensional space
anchors = findAnchors(Q, K, params, candidate_anchors)
print("anchors are:")
for i, a in enumerate(anchors):
    print(i, vocab[a])

# recover topics
A, topic_likelihoods = do_recovery(Q, anchors, loss, params)
print("done recovering")

np.savetxt(outfile + ".A", A)
np.savetxt(outfile + ".topic_likelihoods", topic_likelihoods)

# display
with open(outfile + ".topwords", "w") as f:
    for k in range(K):
        topwords = np.argsort(A[:, k])[-params.top_words:][::-1]
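        # This example is also truncated. A hedged continuation, following
        # the fuller run() method earlier on this page: write the anchor
        # word and its top words for topic k to the output file.
        print(vocab[anchors[k]], ':', end=' ', file=f)
        for w in topwords:
            print(vocab[w], end=' ', file=f)
        print("", file=f)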