Example #1
def findAnchorTopics_Orig(Data,
                          K=10,
                          loss='L2',
                          seed=0,
                          lowerDim=1000,
                          minDocPerWord=0,
                          eps=1e-4,
                          doRecover=1):
    """ Estimate and return K topics using anchor word method

        Returns
        -------
        topics : numpy 2D array, size K x V

    """
    from Q_matrix import generate_Q_matrix
    from fastRecover import do_recovery

    params = Params(seed=seed,
                    lowerDim=lowerDim,
                    minDocPerWord=minDocPerWord,
                    eps=eps)

    assert isinstance(Data, bnpy.data.DataObj)
    DocWordMat = Data.getSparseDocTypeCountMatrix()

    if not str(type(DocWordMat)).count('csr_matrix') > 0:
        raise NotImplementedError('Need CSR matrix')

    Q = generate_Q_matrix(DocWordMat.copy().T)

    anchors = selectAnchorWords(DocWordMat.tocsc(), Q, K, params)

    if doRecover:
        topics, topic_likelihoods = do_recovery(Q, anchors, loss, params)
        topics = topics.T
        topics = topics / topics.sum(axis=1)[:, np.newaxis]
        return topics
    else:
        return Q, anchors
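
Both this example and the ones that follow rely on generate_Q_matrix to build the word co-occurrence matrix Q from a document-word count matrix. Below is a minimal, self-contained numpy sketch of the statistic that matrix approximates; build_cooccurrence_Q and the toy data are made up for illustration only, while the real generate_Q_matrix works on sparse matrices and is far more efficient. It also reproduces the "Q sum is close to 1" sanity check used in Example #2.

import numpy as np

def build_cooccurrence_Q(doc_word_counts):
    """Hypothetical helper: dense word co-occurrence matrix Q (V x V).

    doc_word_counts : array of shape (D, V) with integer word counts per document.
    Each document contributes its normalized off-diagonal co-occurrence counts,
    so the entries of Q sum to (approximately) 1.
    """
    D, V = doc_word_counts.shape
    Q = np.zeros((V, V))
    for w_d in doc_word_counts:
        n_d = w_d.sum()
        if n_d < 2:
            continue  # a one-word document carries no co-occurrence information
        Q += (np.outer(w_d, w_d) - np.diag(w_d)) / (n_d * (n_d - 1.0))
    return Q / D

# toy usage: 3 documents over a 4-word vocabulary
counts = np.array([[2, 1, 0, 0],
                   [0, 1, 3, 0],
                   [1, 0, 1, 2]])
Q = build_cooccurrence_Q(counts)
print("Q sum is", Q.sum())   # close to 1, matching the sanity check in the examples
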
Example #2
params = Params(settings_file)
params.dictionary_file = vocab_file
M = scipy.io.loadmat(infile)['M']
print "identifying candidate anchors"
candidate_anchors = []

#only accept anchors that appear in a significant number of docs
for i in xrange(M.shape[0]):
    if len(np.nonzero(M[i, :])[1]) > params.anchor_thresh:
        candidate_anchors.append(i)

print len(candidate_anchors), "candidates"

#forms Q matrix from document-word matrix
Q = generate_Q_matrix(M)

vocab = file(vocab_file).read().strip().split()

#check that Q sum is 1 or close to it
print "Q sum is", Q.sum()
V = Q.shape[0]
print "done reading documents"

#find anchors- this step uses a random projection
#into low dimensional space
anchors = findAnchors(Q, K, params, candidate_anchors)
print "anchors are:"
for i, a in enumerate(anchors):
    print i, vocab[a]
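
The findAnchors call above selects K nearly-separable rows of Q, optionally after a random projection into a low-dimensional space. The sketch below shows the idea with a simplified greedy Gram-Schmidt search over dense numpy arrays; greedy_anchor_rows is a hypothetical helper, not the library's findAnchors or Projection_Find, and it omits refinements of the published algorithm (for example, centering the rows before projecting).

import numpy as np

def greedy_anchor_rows(Q, K, candidates=None, new_dim=None, seed=0):
    """Greedily pick K rows of Q that are far from the span of those picked so far.

    Simplified illustration of anchor selection: row-normalize Q, optionally
    apply a random projection, then repeatedly take the row with the largest
    residual norm and project that direction out of every row.
    """
    V = Q.shape[0]
    candidates = np.arange(V) if candidates is None else np.asarray(candidates)
    # condition each row so it reads as p(second word | first word)
    Q_bar = Q / np.maximum(Q.sum(axis=1, keepdims=True), 1e-12)
    X = Q_bar[candidates].astype(float)
    if new_dim is not None:
        rng = np.random.RandomState(seed)
        R = rng.randn(X.shape[1], new_dim) / np.sqrt(new_dim)
        X = X @ R                               # random projection into low dimension
    anchors = []
    for _ in range(K):
        norms = np.linalg.norm(X, axis=1)
        j = int(np.argmax(norms))
        anchors.append(int(candidates[j]))
        b = X[j] / max(norms[j], 1e-12)         # new orthonormal direction
        X = X - np.outer(X @ b, b)              # remove that direction from every row
    return anchors

# toy usage with the Q built in the previous sketch:
# anchors = greedy_anchor_rows(Q, K=2)
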
Example #3
def anchor_words(D, loss='L2', params=config.default_config()):
    Q = generate_Q_matrix(D * 100)
    anchors = findAnchors(Q, params['T'], params)
    W, topic_likelihoods = do_recovery(Q, anchors, loss, params)
    return W
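
do_recovery turns Q and the anchor indices into the word-topic matrix. The rough illustration below approximates each word's conditional co-occurrence row as a nonnegative combination of the anchor rows and converts the coefficients into topic-word probabilities. recover_topics_sketch is hypothetical; the actual recovery code solves a simplex-constrained problem with exponentiated gradient under KL or L2 loss rather than plain NNLS.

import numpy as np
from scipy.optimize import nnls

def recover_topics_sketch(Q, anchors):
    """Approximate topic recovery from a co-occurrence matrix and anchor indices.

    Returns A of shape (V, K) whose columns sum to 1.
    """
    V = Q.shape[0]
    K = len(anchors)
    p_w = Q.sum(axis=1)                                   # marginal word probabilities
    Q_bar = Q / np.maximum(p_w[:, None], 1e-12)           # rows are p(w2 | w1)
    anchor_rows = Q_bar[anchors]                          # K x V
    C = np.zeros((V, K))
    for w in range(V):
        coeffs, _ = nnls(anchor_rows.T, Q_bar[w])         # nonnegative least squares
        s = coeffs.sum()
        C[w] = coeffs / s if s > 0 else 1.0 / K           # push coefficients onto the simplex
    A = C * p_w[:, None]                                  # joint weight of word and topic
    A = A / np.maximum(A.sum(axis=0, keepdims=True), 1e-12)  # columns sum to 1
    return A

# toy usage with the sketches above:
# A = recover_topics_sketch(Q, anchors)
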
Example #4
    def run(self):
        params = self.params

        if isinstance(params.infile, str):
            M = scipy.io.loadmat(params.infile)['M']
        else:
            M = params.infile
        assert sparse.isspmatrix_csc(M), "Must provide a sparse CSC matrix"

        print("Input matrix shape: {}".format(M.shape))

        if isinstance(params.vocab_file, str):
            with open(params.vocab_file) as f:
                vocab = f.read().strip().split()
        else:
            vocab = params.vocab_file
        assert np.iterable(vocab), "Must provide an iterable vocab"

        assert M.shape[0] == len(vocab), \
            "Number of rows must correspond to vocab size: {} rows vs {} vocab words" \
            .format(M.shape[0], len(vocab))

        #only accept anchors that appear in a significant number of docs
        print("identifying candidate anchors")
        candidate_anchors = []
        for i in range(M.shape[0]):
            if len(np.nonzero(M[i, :])[1]) > params.anchor_thresh:
                candidate_anchors.append(i)

        print(len(candidate_anchors), "candidates")

        #forms Q matrix from document-word matrix
        Q = generate_Q_matrix(M)

        # Save copy of unnormalized Q, before any normalizations happen
        self.Q_unnormalized = Q.copy()

        #check that Q sum is 1 or close to it
        print("Q sum is", Q.sum())
        V = Q.shape[0]
        print("done reading documents")

        #find anchors- this step uses a random projection
        #into low dimensional space
        anchors = findAnchors(Q, params, candidate_anchors)
        print("anchors are:")
        for i, a in enumerate(anchors):
            print(i, vocab[a])

        #recover topics
        A, topic_likelihoods = do_recovery(Q, anchors, params)
        print("done recovering")

        output_streams = [sys.stdout]
        output_file_handle = None
        if params.outfile is not None:
            np.savetxt(params.outfile+".A", A)
            np.savetxt(params.outfile+".topic_likelihoods", topic_likelihoods)
            output_file_handle = open(params.outfile+".topwords", 'w')
            output_streams.append(output_file_handle)

        def print_multiple(*args, **kwargs):
            # Print the same info to multiple output streams
            for f in output_streams:
                print(*args, file=f, **kwargs)

        # Display top words per topic
        all_topwords = []
        for k in range(params.K):
            topwords = np.argsort(A[:, k])[-params.top_words:][::-1]
            print_multiple(vocab[anchors[k]], ':', end=' ')
            for w in topwords:
                print_multiple(vocab[w], end=' ')
            print_multiple("")
            all_topwords.append(TopWordsSummary(
                topic_index = k,
                anchor_word_index = anchors[k],
                anchor_word = vocab[anchors[k]],
                top_word_indices = topwords,
                top_words = [vocab[w] for w in topwords]))

        if params.outfile is not None:
            output_file_handle.close()

        # make some results available as attributes of "self"
        self.Q = Q
        self.M = M
        self.A = A
        self._R = None
        self.topic_likelihoods = topic_likelihoods
        self.candidate_anchors = candidate_anchors
        self.anchors = anchors
        self.vocab = vocab
        self.all_topwords = all_topwords
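
The "top words per topic" loop in the run() method above simply argsorts each column of A and prints the largest entries in descending order. A tiny self-contained illustration with made-up data:

import numpy as np

# vocab and A below are toy values, not output of the examples
vocab = ["apple", "ball", "cat", "dog", "egg"]
A = np.array([[0.40, 0.01],
              [0.05, 0.30],
              [0.30, 0.04],
              [0.05, 0.60],
              [0.20, 0.05]])   # V x K, columns sum to 1
top_words = 3
for k in range(A.shape[1]):
    topwords = np.argsort(A[:, k])[-top_words:][::-1]   # indices of largest entries, descending
    print(k, ":", " ".join(vocab[w] for w in topwords))
# 0 : apple cat egg
# 1 : dog ball egg
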
Example #5
#only accept anchors that appear in a significant number of docs
for i in xrange(M.shape[0]):
    if len(np.nonzero(M[i, :])[1]) > params.anchor_thresh:
        candidate_anchors.append(i)

print len(candidate_anchors), "candidates"

if len(candidate_anchors) < K:
    print "*** ERROR: there are only", len(
        candidate_anchors), "candidate anchors and K=", K
    print "*** Currently only anchors that appear in more than", params.anchor_thresh, "documents are considered as candidates for anchors"
    print "*** You can change this in the settings file or try to learn a model with fewer anchors"
    sys.exit()

#forms Q matrix from document-word matrix
Q = generate_Q_matrix(M)

vocab = file(vocab_file).read().strip().split()

#check that Q sum is 1 or close to it
print "Q sum is", Q.sum()
V = Q.shape[0]
print "done reading documents"

#find anchors- this step uses a random projection
#into low dimensional space

anchor_logfile = file(params.log_prefix + '.anchors', 'w')
anchors = findAnchors(Q, K, params, candidate_anchors, anchor_logfile)
print "anchors are:"
print >> anchor_logfile, "anchors are:"
Example #6
    vocab = file(vocab_file).read().strip().split()
    V = M.shape[0]
    prng = RandomState(params.seed)
    R = rp.Random_Matrix(V, params.new_dim, prng)

    #only accept anchors that appear in a significant number of docs
    print "identifying candidate anchors"
    candidate_anchors = []
    for i in xrange(V):
        if len(np.nonzero(row_M[i, :])[1]) > params.anchor_thresh:
            candidate_anchors.append(i)
    print len(candidate_anchors), "candidates"

    Q = np.vstack(
        generate_Q_matrix(row_M,
                          col_M,
                          row_normalize=True,
                          projection_matrix=R.T))  #row-by-row generation
    _, anchors = gs.Projection_Find(Q, K, candidate_anchors)
    print "anchors are:", anchors
    anchor_file = file(outfile + '.anchors', 'w')
    print >> anchor_file, "\t".join(["topic id", "word id", "word"])
    for i, a in enumerate(anchors):
        print i, vocab[a]
        print >> anchor_file, "\t".join([str(x) for x in (i, a, vocab[a])])

    anchor_file.close()

    #recover topics
    row_sums = np.array(row_M.sum(1)).reshape(V)
    #generate Q_matrix rows for anchors
    Q_A = np.vstack(
        generate_Q_matrix(row_M,
                          col_M,
                          row_normalize=True,
                          indices=anchors,
                          projection_matrix=None))
Example #7

    vocab = file(vocab_file).read().strip().split()
    V = M.shape[0]
    prng = RandomState(params.seed)
    R = rp.Random_Matrix(V, params.new_dim, prng)

    #only accept anchors that appear in a significant number of docs
    print "identifying candidate anchors"
    candidate_anchors = []
    for i in xrange(V):
        if len(np.nonzero(row_M[i, :])[1]) > params.anchor_thresh:
            candidate_anchors.append(i)
    print len(candidate_anchors), "candidates"

    Q = np.vstack(generate_Q_matrix(row_M, col_M, row_normalize=True, projection_matrix=R.T)) #row-by-row generation
    _, anchors = gs.Projection_Find(Q, K, candidate_anchors)
    print "anchors are:", anchors
    anchor_file = file(outfile+'.anchors', 'w')
    print >>anchor_file, "\t".join(["topic id", "word id", "word"])
    for i, a in enumerate(anchors):
        print i, vocab[a]
        print >>anchor_file, "\t".join([str(x) for x in (i,a,vocab[a])])

    anchor_file.close()

    #recover topics
    row_sums = np.array(row_M.sum(1)).reshape(V)
    #generate Q_matrix rows for anchors
    Q_A = np.vstack(generate_Q_matrix(row_M, col_M, row_normalize=True, indices=anchors, projection_matrix=None))
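
Examples #6 and #7 build a random projection matrix with rp.Random_Matrix(V, params.new_dim, prng) so that the anchor search can run in new_dim dimensions instead of V. A minimal sketch of such a matrix is shown below; random_projection_matrix is a hypothetical stand-in, and some implementations draw sparse +/-1 entries instead of Gaussians.

import numpy as np
from numpy.random import RandomState

def random_projection_matrix(V, new_dim, prng):
    """V x new_dim matrix of scaled Gaussian entries.

    Multiplying a length-V row by it gives a new_dim-dimensional sketch that
    approximately preserves pairwise distances (Johnson-Lindenstrauss), which
    is what makes the anchor search cheap in the projected space.
    """
    return prng.randn(V, new_dim) / np.sqrt(new_dim)

# toy usage
prng = RandomState(0)
R = random_projection_matrix(5000, 1000, prng)
print(R.shape)  # (5000, 1000)
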