Exemplo n.º 1
0
def rtTopics(dire, direo, topicnum, topicwordsnum, iternum=1000,
             cloudprefix='E:/Other/USTB/nlp/WordCloud/tag_cloud'):
    """
    Fit one LDA topic model over all files of a directory and write the
    top words of every topic to a text file.

    Every entry of dire is segmented with seg.rtSegResult. For the k-th
    entry (1-based, in sorted name order) the segmentation result is saved
    to direo + 'words<k>.txt' and drawn as a word cloud at
    cloudprefix + '<k>.jpg'. The word counts of all files are collected
    into one document-term matrix (one row per file, one column per
    distinct word), the model is fitted on it once, and the topics are
    written to direo + 'topics<n>.txt', n being the number of files.

    This code targets Python 2: file names and words are handled as byte
    strings that are decoded/encoded explicitly.

    Parameters:
        dire: directory that holds the input files.
        direo: prefix put in front of every output file name; it is
            concatenated as-is, so include the trailing separator when it
            is a directory.
        topicnum: number of topics passed to lda.LDA.
        topicwordsnum: number of words written for every topic.
        iternum: number of iterations passed to lda.LDA.
        cloudprefix: prefix of the word-cloud image paths; the default is
            the location that used to be hard-coded.

    Returns:
        None. Nothing is written when dire has no entries.
    """
    model = lda.LDA(n_topics=topicnum, n_iter=iternum, random_state=1)

    # word -> {document index: count}. This must live outside the loop:
    # resetting it (and filenum) per file made every file overwrite the
    # outputs numbered 1 and left the matrix with a single row.
    wordsdic = {}
    filenum = 0
    # Sorted so that the numbering of the output files is reproducible.
    for filename in sorted(os.listdir(dire)):
        # Re-encode the entry name from GBK to UTF-8 byte strings.
        filename = filename.decode('gbk').encode('utf-8')
        fdir = os.path.join(dire, filename)
        SegResults = seg.rtSegResult(fdir, True, True)
        seg.FileSegResult(direo + 'words' + str(filenum + 1) + '.txt',
                          SegResults)
        seg.PlotWordCloud(SegResults,
                          cloudprefix + str(filenum + 1) + '.jpg')
        for s in SegResults:
            # NOTE(review): s is presumably a (word, count) pair -- confirm
            # against seg.rtSegResult. A repeated word keeps its last count.
            wordsdic.setdefault(s[0].encode('utf-8'), {})[filenum] = s[1]
        filenum += 1

    if filenum == 0:
        # Empty directory: there is nothing to fit or to write.
        return

    vocab = tuple(wordsdic)
    X = zeros((filenum, len(vocab)), dtype=int)
    for wordnum, w in enumerate(vocab):
        for t, count in wordsdic[w].items():
            X[t][wordnum] = count
    print(X.sum())

    model.fit(X)
    vocabarr = array(vocab)
    with codecs.open(direo + 'topics' + str(filenum) + '.txt', 'w',
                     'utf-8') as fileout:
        for i, topic_dist in enumerate(model.topic_word_):
            # argsort is ascending, so the reversed tail slice yields the
            # topicwordsnum highest-weighted words, best first.
            topic_words = vocabarr[argsort(topic_dist)][
                :-(topicwordsnum + 1):-1]
            fileout.write(('Topic {}: {}'.format(i, '+'.join(topic_words)) +
                           '\n').decode('utf-8'))
Exemplo n.º 2
0
# coding=utf-8
'''
Created on Sep. 29, 2015

@author: LuDan
'''
import Segmentation as seg

# Settings for this demo run: the text to segment and the file that
# receives the segmentation result.
fdir = "E:/Other/USTB/nlp/Text1.txt"
# NOTE(review): frfile presumably means "read the text from a file" --
# confirm against seg.rtSegResult, which receives it as second argument.
frfile = True
ofile = "text.txt"

# Get the segmentation results.
SegResults = seg.rtSegResult(
    fdir,
    frfile,
    True,
)

# Put the segmentation results into a file.
seg.FileSegResult(ofile, SegResults)

# Plot the word cloud.
seg.PlotWordCloud(SegResults, "Ttag_cloud.jpg", 30, 20, 80)