Example #1
def get_lda(text_dictionary):
    train = []

    for key, line in text_dictionary.items():
        line = line.strip().split(' ')
        train.append(line)

    print(len(train))
    print(' '.join(train[2]))

    dictionary = corpora.Dictionary(train)
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)

    topic_list = lda.print_topics(20)
    print(type(lda.print_topics(20)))
    print(len(lda.print_topics(20)))

    for topic in topic_list:
        print(topic)
    print("第一主题")
    print(lda.print_topic(1))

    print('Given a new document, output its topic distribution')

    # test_doc = list(new_doc)  # tokenize the new document
    test_doc = train[2]  # inspect the topic distribution of the third training sample
    doc_bow = dictionary.doc2bow(test_doc)  # convert the document to bag-of-words
    doc_lda = lda[doc_bow]  # topic distribution of the document
    # print the document's topic distribution
    print(doc_lda)
    for topic in doc_lda:
        print("%s\t%f\n" % (lda.print_topic(topic[0]), topic[1]))
Example #2
def chosen_lda(corpus, dictionary, data, n_topics, alpha=.1, eta=0.01):
    '''
    This function trains a Gensim LDA model on chosen hyperparameters
    
    Arguments:
    ----------
    corpus : matrix-format corpus (BOW or TF-IDF)
    dictionary : corpus-related dictionary
    data : text data for coherence score computation
    n_topics : number of desired topics
    alpha : alpha parameter (from 0 to infinity)
    eta : beta parameter (from 0 to infinity)
    
    Outputs:
    ----------
    lda : trained model
    '''
    
    lda = LdaModel(corpus=corpus, 
                id2word=dictionary, 
                num_topics=n_topics, 
                random_state=100, 
                alpha=alpha, 
                eta=eta)
    
    ldatopics = [[word for word, prob in topic] for topicid, topic in lda.show_topics(formatted=False)]
    lda_coherence = CoherenceModel(topics=ldatopics, texts=data, dictionary=dictionary, window_size=10).get_coherence()
    print(lda_coherence)
    lda.print_topics(num_topics=n_topics)
    
    lda.save('../03_Dump/model')
    return lda
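A minimal usage sketch for chosen_lda, assuming gensim is available; the toy texts and hyperparameters below are illustrative, and note that the function also writes the trained model to '../03_Dump/model'.

from gensim.corpora import Dictionary

texts = [["lda", "topic", "model"], ["coherence", "score", "example"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

model = chosen_lda(corpus, dictionary, texts, n_topics=5, alpha=0.1, eta=0.01)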
Example #3
def save_model(model_path):
    train_set = get_train_set()
    # build the training corpus
    dictionary = Dictionary(train_set)
    corpus = [dictionary.doc2bow(text) for text in train_set]

    # train the LDA model
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)
    lda.print_topics(100)

    lda.save(model_path)
Example #4
def get_gensim_topics(num_topics_list, sentences, print_flag = False):
    """
    Gensim by default employs a version of count vectorization
    input: sentences (list of list of words)
    outputs coherence, perplexity, and topics 
    prints topics if print == True 
    """
    texts = sentences.apply(retokenize).tolist() 
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    perplexity_ls = []
    coherence_ls = []
    for i in num_topics_list:
        lda = LdaModel(corpus, num_topics=i, id2word = dictionary, random_state = 10)
        perplexity = lda.log_perplexity(corpus)
        perplexity_ls.append(perplexity) 
        coherence_model_lda = CoherenceModel(model = lda, texts = texts, dictionary = dictionary, coherence = 'c_v')
        coherence = coherence_model_lda.get_coherence()
        coherence_ls.append(coherence)

        if print_flag:
            print('Num. Topics: ', i)
            print('')
            for topic in lda.print_topics():
                words = topic[1]
                words_ls = words.split('+')
                words_ls = [w.split('*')[1] for w in words_ls]
                words_ls = [w.replace('"', '') for w in words_ls]
                print(', '.join(words_ls))
        print('')
    return perplexity_ls, coherence_ls
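A hedged usage sketch for get_gensim_topics: the sentences are passed as a pandas Series of raw strings, and retokenize (defined elsewhere in the source project, assumed here to split a sentence into word tokens) is applied inside the function.

import pandas as pd

sentences = pd.Series([
    "topic models discover latent themes in text",
    "gensim trains lda models on bag of words corpora",
])
perplexity_ls, coherence_ls = get_gensim_topics([2, 3], sentences, print_flag=True)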
Example #5
def get_lda_model():
    """
    (50,28767)
    Obtain the topics.
    :return:
    """
    text_array = list()

    with open("jobs-unigrams-filter") as f:
        for line in tqdm(f):
            line = line.strip().split(" ")
            line = line[1:]  # drop the first token
            text_array.append(line)

    dictionary = Dictionary(text_array)
    # print(common_dictionary)
    common_corpus = [dictionary.doc2bow(text) for text in text_array]
    # Train the model on the corpus.
    lda = LdaModel(common_corpus,
                   id2word=dictionary,
                   num_topics=50,
                   passes=10,
                   iterations=1000)
    temp_file = datapath("LDA_twitter")
    lda.save(temp_file)
    topics = lda.get_topics()
    print(topics.shape)

    topic_list = lda.print_topics(50)
    for topic in topic_list:
        print(topic)
Example #6
def lda(clean_docs, model_name, topics):
    # turn all data into a dictionary mapping of normalized words and their integer ids
    from gensim import corpora
    dictionary = corpora.Dictionary(clean_docs)

    # convert each document, called text, into bag-of-words representation (list of (token_id, token_count) tuples)
    # in other words, it counts how often each word occurs in each doc of the text and saves that in the corpus
    corpus = []
    for doc in clean_docs:
        corpus.append(dictionary.doc2bow(doc))

    # serialize version: save dictionary and corpus for future use
    from gensim.corpora import MmCorpus
    MmCorpus.serialize('corpus_' + model_name + '.mm', corpus)
    dictionary.save('dictionary_' + model_name + '.gensim')

    # Train LDA model
    from gensim.models import LdaModel
    num_topics = topics  # find this number of topics in the data
    passes = 15

    ldamodel = LdaModel(corpus,
                        num_topics=num_topics,
                        id2word=dictionary,
                        passes=passes)
    ldamodel.save('model_' + model_name + '.gensim')
    topics = ldamodel.print_topics(num_words=5)

    for topic in topics:
        print(topic)
Example #7
def test(key, text1):
    #comments,messages = read_data()
    #text1 = key_word(comments,messages)
    # build the dictionary
    dictionary = corpora.Dictionary(text1)
    # build the corpus
    corpus = [dictionary.doc2bow(text) for text in text1]
    # TF-IDF weighting
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    p_list = []
    topicnum_list = []
    num_topic = 2
    #for i in range(2,50):
    #    num_topics = i
    lda = LdaModel(corpus=corpus_tfidf,
                   id2word=dictionary,
                   num_topics=num_topic)
    #perplex = lda.log_perplexity(corpus_tfidf)
    #p_list.append(perplex)
    #topicnum_list.append(num_topics)
    topics = lda.print_topics()
    print '*' * 50
    print key, ':'
    for i in topics:
        str1 = str(i[0]) + ':'
        print str1, i[1].encode('utf8')
Example #8
def Train(train_set):

    # stopwords = codecs.open('stopwords.txt', 'r', encoding='utf8').readlines()
    # stopwords = [w.strip() for w in stopwords]
    # train_set = []
    # for line in train:
    #     line = list(jieba.cut(line))
    #     train_set.append([w for w in line if w not in stopwords])

    # build the training corpus
    dictionary = Dictionary(train_set)
    corpus = [dictionary.doc2bow(text) for text in train_set]

    # train the LDA model
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
    lda.print_topics(20)
Example #9
def create_gensim_lda_model(dictionary, corpus, number_of_topics, words):
    # create the LDA model
    ldamodel = LdaModel(corpus,
                        num_topics=number_of_topics,
                        id2word=dictionary)
    print(ldamodel.print_topics(num_topics=number_of_topics, num_words=words))
    return ldamodel
Example #10
def lda_process(split_text, embedding_size, wordvec):
    new_line, new_dict = [], []
    for line in split_text:
        for w in line.split():
            if w in stopwords: continue
            new_line.append(w)
        new_dict.append(new_line)
        new_line = []

    dictionary = Dictionary(new_dict)

    corpus = [dictionary.doc2bow(text) for text in new_dict]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=1, passes=20)

    _, title_terms = lda.print_topics(num_words=5)[0]

    title_vec = []

    sub_terms = title_terms.split('+')
    for term in sub_terms:
        listItems = term.split('*')
        try:
            title_vec.append(
                float(listItems[0]) *
                wordvec[re.findall(r'\"(.+)\"', listItems[1])[0]])
        except KeyError:
            title_vec.append(float(listItems[0]) * np.zeros(embedding_size))
    #print(wordvec[re.findall(r'\"(.+)\"',listItems[1])])

    title_vector = np.average(np.array(title_vec), axis=0)

    return title_vector.reshape(1, embedding_size)
Example #11
def LDA_model_from_token(text_file_name):
    token_file_name = text_file_name[:-4] + '.csv'
    print("loading "+token_file_name)
    data_word = []
    with codecs.open(token_file_name, 'r') as f:
        rdr = csv.reader(f)
        next(rdr)
        for i, line in enumerate(rdr):
            data_word.append(line)
        print("Complete loading")


    id2word=corpora.Dictionary(data_word)
    id2word.filter_extremes(no_below=10)  # drop words that appear in fewer than 10 documents
    texts = data_word
    corpus=[id2word.doc2bow(text) for text in texts]

    lda = LdaModel(corpus, num_topics=10, id2word=id2word)

    temp_file = datapath(token_file_name[:-4])
    lda.save(temp_file)

    lda = LdaModel.load(temp_file)

    topics = lda.print_topics(num_words=10)
    for topic in topics:
        print(topic)
Example #12
def train(corpuspath, modelpath):
    train = []
    # stopwords = codecs.open('stopWords/1893(utf8).txt','r',encoding='utf8').readlines()
    # stopwords = [ w.strip() for w in stopwords ]
    fp = codecs.open(corpuspath, 'r', encoding='utf8')
    for line in fp:
        line = line.strip()
        if line == '':continue
        line = line.split()
        train.append([w for w in line])

    dictionary = corpora.Dictionary(train)
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=20)
    lda.save(modelpath)

    topic_words = open('../result/lda/fact_lad_print-10.txt','w',encoding='utf-8')
    print_str = ''
    for topic in lda.print_topics(num_words=100):
        termNumber = topic[0]
        listOfTerms = topic[1].split('+')
        # for term in listOfTerms:
        #     listItems = term.split('*')
        #     # print(listItems)
        #     print('  ', listItems[1], '(', listItems[0], ')', sep='')
        print_str += topic[1] + '\n'
    topic_words.write(print_str)
    topic_words.close()
Example #13
def build_lda_model(data_name):
    # load the training data
    train_data = np.load(open('./dict/' + data_name + '.npy', 'rb'))
    # load the dictionary
    dictionary = pickle.load(open('./dict/' + data_name + '.pkl', 'rb'))
    lda = LdaModel(train_data,
                   id2word=dictionary,
                   num_topics=20,
                   passes=2,
                   alpha='symmetric',
                   eta=None)

    lda.print_topics(num_topics=20, num_words=10)

    # save the model
    lda.save('./lda/' + data_name + '.model')
Example #14
def main():
    sentence_list = load_data('E:\\tmp\\csv_test')
    stop_words = load_stop_word('dependencies/stop_word.txt')
    word_split = participle(sentence_list, stop_words)
    dictionary = Dictionary(word_split)
    corpus = [dictionary.doc2bow(text) for text in word_split]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)
    print(lda.print_topics(10))
Example #15
File: lda.py  Project: pengm-hub/HIN
def save_ldamodel(dictionary, text_data, cnt_cata):

    corpus = [dictionary.doc2bow(text) for text in text_data]
    ldamodel = LdaModel(corpus, num_topics=cnt_cata, id2word=dictionary)
    # inspect the topics
    for topic in ldamodel.print_topics():
        print(topic[1])
    ldamodel.save('model/{}/ADA.gensim'.format(cnt_cata))
Example #16
 def LDA(self, topics):
     # convert the vectorized data to a gensim corpus object
     corpus = gensim.matutils.Sparse2Corpus(self.corpusVectorized, documents_columns=False)
     # maintain a dictionary for index-word mapping
     id2word = dict((v, k) for k, v in self.vectorizer.vocabulary_.iteritems())
     print id2word
     # build the lda model
     lda = LdaModel(corpus, num_topics=topics, id2word=id2word, passes=10)
     print lda.print_topics()
     lda_docs = lda[corpus]
     for row in lda_docs:
         print row
     scores = np.round([[doc[1] for doc in row] for row in lda_docs], 3)
     print scores
     cols=[]
     for i in range(topics):
         cols.append("topic "+str(i))
     df_lda = pd.DataFrame(scores, columns=cols)
     return df_lda
Example #17
class TopicModel(object):
    def dataPreprocess(self,path):
        self.preprocess=Preprocess()
        self.preprocess.reader(path)

    def train(self):
        self.lda=LdaModel(self.preprocess.corpus,id2word=self.preprocess.dictionary,num_topics=10)
        for topic in self.lda.print_topics(num_topics=10,num_words=10):
            print(topic[1])

    def evaluation(self):
        pass
Example #18
File: news.py  Project: xialei/poc
def lda():
    # remove stop words
    stopwords = codecs.open('../conf/stop_words_ch.txt', mode='r', encoding='utf8').readlines()
    stopwords = [ w.strip() for w in stopwords ]
    
    fp = codecs.open('D:\\nlp\corpora\segs.txt', mode='r', encoding='utf8')
    train = []
    for line in fp:
        line = line.split()
        train.append([ w for w in line if w not in stopwords ])
    
    dictionary = corpora.Dictionary(train)
    corpus = [ dictionary.doc2bow(text) for text in train ]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)
    
    lda.print_topics(30)
    # print topic id=20
    lda.print_topic(20)
    
    # save/load model
    lda.save('D:\\nlp\corpora\news.model')
Example #19
class LDAModelGensim:
    '''
    Creates an LDA model using Gensim's LdaModel class.
    '''
    def __init__(self, sentences=None, num_topics=2):
        self.converter = CorpusConverter()
        self.corpus, self.id2word = self.converter.convert(sentences)
        self.num_topics = num_topics
        self.lda_model = LdaModel(self.corpus, self.num_topics, self.id2word)

    def get_model_topics(self):
        return self.lda_model.print_topics(-1)
Example #20
def get_topic_words(sent, stop_words, cnt=15):
    sent = re.sub(r'[\r\n]', '', sent)
    wlst = jieba.lcut(sent)
    ls = []
    for w in wlst:
        if w not in stop_words:
            ls.append(w)

    di = Dictionary([ls])
    corpus = [di.doc2bow(text) for text in [ls]]
    lda = LdaModel(corpus, id2word=di, num_topics=1)
    tp = lda.print_topics(num_words=cnt)[0][1]
    return re.findall('"(.+?)"', tp)
Example #21
def my_lda_learn(topic):
    text = []
    with open(filtered_data_file, 'r', encoding="UTF-8") as f:
        for line in f.readlines():
            text.append(line.split())
    dictionary = Dictionary(text)

    text2bow = [dictionary.doc2bow(one_text) for one_text in text]

    my_lda = LdaModel(
        text2bow, id2word=dictionary, num_topics=topic, passes=20)

    print(my_lda.print_topics(num_topics=topic, num_words=10))
Example #22
def explore(parameters, run):
    print(parameters)
    no_above = parameters["no_above"]
    chunksize = parameters["chunksize"]
    passes = parameters["passes"]
    iterations = parameters["iterations"]
    size = parameters["size"]
    num_topics = parameters["num_topics"]

    with open(fname, 'a', newline='', encoding='utf-8') as csv_file:
        run += 1
        print("Run " + str(run) + " out of " + str(runs))
        writer = csv.writer(csv_file)
        dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_tokens=None)
        corpus = [dictionary.doc2bow(review) for review in reviews]
        corpora.MmCorpus.serialize(name + '.mm', corpus)
        mm = corpora.MmCorpus(
            name + '.mm')  # `mm` document stream now has random access
        mm_used = mm[:size]
        writer.writerows([[
            "Data size", "Topics", "no_above", "Chunksize", "Passes",
            "Iteration"
        ], [size, num_topics, no_above, chunksize, passes, iterations], []])

        lda = LdaModel(mm_used,
                       num_topics=num_topics,
                       chunksize=chunksize,
                       id2word=dictionary,
                       passes=passes,
                       iterations=iterations,
                       eval_every=eval_every)

        lst = []
        for topic in lda.print_topics(-1, 10):
            terms = [
                x[0] for x in lda.get_topic_terms(topic[0], topn=10)
            ]
            term_strings = [str(dictionary[term]) for term in terms]
            str_topic = []
            str_topic.append("Topic " + str(topic[0] + 1))
            str_topic.extend(term_strings)
            lst.append(str_topic)

        writer.writerows(zip(*lst))
        writer.writerow([])

        return run
Example #24
def perform_lda_iterations(num_topics, num_passes):
    """Performs LDA on up to a specified number of topics with a specified number of passes."""
    tw = joblib.load('../data/clean/tweets-series.pkl')
    rm = joblib.load('../data/clean/remarks-series.pkl')

    rm = rm.apply(__unlist)
    tcv = CountVectorizer(stop_words='english',
                          token_pattern="\\b[a-z][a-z]+\\b")
    tdm = tcv.fit_transform(tw).transpose()

    rcv = CountVectorizer(stop_words='english',
                          token_pattern="\\b[a-z][a-z]+\\b")
    rdm = rcv.fit_transform(rm).transpose()

    tc = matutils.Sparse2Corpus(tdm)
    rc = matutils.Sparse2Corpus(rdm)

    tid2word = dict((v, k) for k, v in tcv.vocabulary_.items())
    rid2word = dict((v, k) for k, v in rcv.vocabulary_.items())

    for i in range(2, num_topics):
        tlda = LdaModel(corpus=tc,
                        num_topics=i,
                        minimum_probability=0.03,
                        id2word=tid2word,
                        passes=num_passes)
        print('Modeled topics at ', i)
        print(tlda.print_topics())

    for i in range(2, num_topics):
        rlda = LdaModel(corpus=rc,
                        num_topics=i,
                        minimum_probability=0.03,
                        id2word=rid2word,
                        passes=num_passes)
        print('Modeled topics at ', i)
        print(rlda.print_topics())
Example #25
    def make_lda_model(self, num_topics=11):
        '''
        Build an optimized LDA model.
        prints a coherence score for sanity checking (EDA has revealed the target coherence to be ~0.39)
        '''
        print('  - Building LDA model with {} topics'.format(num_topics))

        dictionary = corpora.Dictionary(self.token_list)
        corpus = [dictionary.doc2bow(text) for text in self.token_list]

        #set up mallet path
        # os.environ.update({'MALLET_HOME':r'anaconda3/lib/python3.7/site-packages/mallet-2.0.8/'})
        # mallet_path = '/anaconda3/lib/python3.7/site-packages/mallet-2.0.8/bin/mallet' # update this path
        #
        # #Make Model:
        # ldamallet = LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
        ldamodel = LdaModel(corpus,
                            num_topics=num_topics,
                            id2word=dictionary,
                            passes=20)
        #Get Coherence Score:
        coherence_score = CoherenceModel(model=ldamodel,
                                         texts=self.token_list,
                                         dictionary=dictionary,
                                         coherence='c_v').get_coherence()
        # model_topics = optimal_model.show_topics(formatted=False)

        # print topics
        pp.pprint(ldamodel.print_topics(num_words=6))
        print("  - Num Topics: {}. Coherence Value of: {:2.3f}".format(
            num_topics, coherence_score))

        self.all_topics = ldamodel.print_topics(num_words=6)
        self.ldamodel = ldamodel
        self.corpus = corpus
        self.dictionary = dictionary
        self.coherence_score = coherence_score
Example #26
def lda_model(dictionary, corpus, corpus_tfidf, cluster_keyword_lda):  # use the LDA model to obtain the topic distribution
    lda = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=20)
    f_keyword = open(cluster_keyword_lda, 'w+',encoding='utf-8')
    for topic in lda.print_topics(20, 20):
        print('****' * 5)
        words = []
        for word in topic[1].split('+'):
            word = word.split('*')[1].replace(' ', '')
            words.append(word)
        f_keyword.write(str(topic[0]) + '\t' + ','.join(words) + '\n')
    # use the LDA model to represent the documents as vectors; this effectively reduces the TF-IDF representation to a dimensionality equal to the configured number of topics
    corpus_lda = lda[corpus_tfidf]
    for doc in corpus_lda:
        print(len(doc), doc)
    return lda
Example #27
def lda_output(text, train):
    line_list = text.split(" ")
    train.append([w for w in line_list if w not in stopwords])

    dictionary = corpora.Dictionary(train)
    corpus = [dictionary.doc2bow(train_text) for train_text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=topic_num)
    lda.save('news_lda.model')

    # print the word distributions of the first topic_num topics
    topics = lda.print_topics(topic_num)

    # print each topic's word distribution
    for i in range(topic_num):
        print(str(i) + ":" + topics[i][1])
    print("===============================")
Example #28
def lda_model(cut_df, num_topics=10, top_words=5, show=True):
    te = cut_df['cut_doc'].values
    dictionary = gensim.corpora.Dictionary(te)
    corpus = [dictionary.doc2bow(text) for text in te]
    #corpus -> tfidf
    tfidf = TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    #tfidf -> lda
    lda = LdaModel(corpus=corpus_tfidf,
                   id2word=dictionary,
                   num_topics=num_topics)
    corpus_lda = lda[corpus]
    if show:
        topics = lda.print_topics(num_topics, top_words)
        for toc in topics:
            print(toc)
    return lda
Example #29
def lda_model(sentence_dict, dictionary, corpus, corpus_tfidf, cluster_keyword_lda, target_lt, num_cluster=11):
    '''Use the LDA model to obtain the topic distributions.'''
    lda = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=num_cluster)
    f_keyword = open(cluster_keyword_lda, 'w+')
    for topic in lda.print_topics(num_cluster, 53):
        #print('***************************')
        words=[]
        for word in topic[1].split('+'):
            word = word.split('*')[1].replace(' ','')
            words.append(word)
        f_keyword.write(str(topic[0])+'\t'+','.join(words)+'\n')

    # use the LDA model to represent the documents as vectors; this effectively reduces the TF-IDF representation to a dimensionality equal to the configured number of topics
    corpus_lda = lda[corpus_tfidf]
    write_results("./results_lda.txt", corpus_lda, target_lt)

    return lda
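The comment above frames the per-document topic distribution as a low-dimensional document vector; below is a hedged sketch (reusing the local names from lda_model above) of densifying that sparse output into fixed-length vectors, assuming gensim's matutils helper.

from gensim import matutils

# corpus_lda yields sparse (topic_id, weight) pairs per document; sparse2full
# pads the missing topics with zeros so every document becomes a dense vector
# of length num_cluster.
doc_vectors = [matutils.sparse2full(doc, num_cluster) for doc in corpus_lda]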
Example #30
File: lda.py  Project: AyaRamazanova/LDA
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir',
                        default='./data/test_arxiv_plain.txt',
                        help='Path to directory where the data is stored')
    parser.add_argument('--model-dir',
                        default='../model',
                        help='Path to directory where the model is stored')
    parser.add_argument('--train',
                        default=True,
                        help='True for train, False for test mode')
    parser.add_argument('--n_topic', default=20, type=int, help='Number of topics')
    args = parser.parse_args()
    model_dir = './model/model'
    dict_dir = './model/dict.txt'

    if args.train == True:
        print('Reading texts')
        with open(args.data_dir) as f_in:
            texts = f_in.read().split('\n')
        del texts[-1]
        for i in tqdm(range(len(texts))):
            texts[i] = texts[i].split()

        print('Generating corpora')
        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        dictionary.save_as_text(dict_dir)

        print('Loading model')
        lda = LdaModel(corpus, num_topics=args.n_topic)
        lda.save(model_dir)
    else:
        lda = LdaModel.load(model_dir, mmap='r')
        dictionary = Dictionary.load_from_text(dict_dir)

    print('Processing results')
    topics = lda.print_topics()
    with open('./report.txt', 'w') as f_out:
        for topic_id, topic_pair in topics:
            print(topic_id, end=': ', file=f_out)
            topic_words = topic_pair.split('"')[1::2]
            topic_words = list(map(int, topic_words))
            topic_words = [dictionary.get(word) for word in topic_words]
            print(topic_words, file=f_out)
Example #31
def perform_lda(doc_set, num_topics=64):
    print('lda started')
    tokenizer = RegexpTokenizer(r'\w+')
    raw = " ".join([doc.lower() for doc in doc_set])
    tokens = tokenizer.tokenize(raw)

    en_stop = get_stop_words('en')
    stopped_tokens = [i for i in tokens if not i in en_stop]
    # p_stemmer = PorterStemmer()
    # texts = [p_stemmer.stem(i) for i in stopped_tokens]
    texts = stopped_tokens
    dictionary = corpora.Dictionary([stopped_tokens])
    corpus = [dictionary.doc2bow(text.split()) for text in texts]
    ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=20)

    for line in ldamodel.print_topics(num_topics=num_topics, num_words=10):
        print('\t', line)
Example #32
    # stemming process
    print(count)
    # print(List)
    # counts = Counter(List)
    # print(counts)
    print(documentInfo)
    train_set = documentInfo

    # construct training corpus
    dictionary = Dictionary(train_set)
    corpus = [dictionary.doc2bow(text) for text in train_set]
    print(corpus)
    print(dictionary)
    # train lda model
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=30)
    print(lda)
    print(lda.print_topics(5))


    #
    # def lda_test(train_set):
    #     # train corpus
    #     dictionary = Dictionary(train_set)
    #     corpus = [dictionary.doc2bow(text) for text in train_set]
    #     print(corpus)
    #     print(dictionary)
    #     # lda model training
    #     lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=50)
    #     print(lda)
    #     return (lda.print_topics(50))
Example #33
print 'Saving dictionary (%s)...' % DICT
dictionary.save(DICT)

print 'Building bag-of-words corpus ...'
bow_corpus = [ dictionary.doc2bow(t) for t in texts ]

print 'Serializing corpus (%s) ...' % BOW
MmCorpus.serialize(BOW, bow_corpus)

size = len(bow_corpus) * 4 / 5
training = bow_corpus[:size]
testing = bow_corpus[size:]

print 'Training LDA w/ %d topics on first %d texts ...' % (Num_Topics, len(training))
lda = LdaModel(training, id2word=dictionary, num_topics=Num_Topics, passes=5, iterations = 1000)

print 'Saving LDA model (%s) ...' % NSFLDA
lda.save(NSFLDA)

print 'Random subset of topics:'
print '\n'.join(lda.print_topics())

print 'Computing perplexity on %d held-out documents ...' % len(testing)
perplexity = 2 ** -(lda.log_perplexity(testing))
print 'Perplexity: %.2f' % perplexity
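For context: gensim's LdaModel.log_perplexity returns a per-word likelihood bound, and gensim itself reports perplexity as 2 ** (-bound), which is what the line above computes. A tiny self-contained check (toy corpus, illustrative names):

from gensim.corpora import Dictionary
from gensim.models import LdaModel

docs = [["alpha", "beta", "gamma"], ["beta", "gamma", "delta"]]
d = Dictionary(docs)
bow = [d.doc2bow(doc) for doc in docs]
model = LdaModel(bow, id2word=d, num_topics=2)

bound = model.log_perplexity(bow)  # per-word likelihood bound on the evaluation corpus
print(2 ** -bound)                 # perplexity, as gensim reports it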




        #
        # logging.info('combine report and wiki dictionary...')
        # wiki_to_report = report_dict.merge_with(wiki_dict)
        # merged_dict = report_dict
        #
        # logging.info('combine report and wiki corpus...')
        # merged_corpus = wiki_to_report[wiki_corpus].corpus + report_corpus
        logging.info('generate wiki corpus...')
        wiki_txt = unpickle('data/txt/processed_wiki.pkl')
        wiki_corpus = [report_dict.doc2bow(wiki) for wiki in wiki_txt]

        logging.info('combine report and wiki corpus...')
        merged_corpus = wiki_corpus + report_corpus

    # compute TFIDF
    # logging.info('compute TFIDF...')
    # tfidf = TfidfModel(dictionary=report_dict, id2word=report_dict)

    # perform LDA
    logging.info('perform LDA...')
    if use_wiki is True:
        lda = LdaModel(corpus=merged_corpus, id2word=report_dict, num_topics=num_topics, passes=passes,
                       iterations=iterations, alpha='auto', chunksize=chunksize)
        lda.save('result/model_wiki.lda')
        lda.print_topics(num_topics=num_topics, num_words=10)
    else:
        lda = LdaModel(corpus=report_corpus, id2word=report_dict, num_topics=num_topics, passes=passes,
                       iterations=iterations, alpha='auto', chunksize=chunksize)
        lda.save('result/model.lda')
        lda.print_topics(num_topics=num_topics, num_words=10)