# News categories; each has a matching article<name>.txt file under
# News_Dataset (see the countline/codecs calls below and in this file).
ARTICLE_NAME = ["Computer", "Entertainment", "Sports", "Science",
                "Economy", "World", "Politics", "Society"]

# Raw article text per category, keyed by category name (filled in later).
ARTICLE = {name: "" for name in ARTICLE_NAME}

ret = []  # NOTE(review): never used in this chunk — kept for compatibility

# One placeholder list per category.  Sized from ARTICLE_NAME instead of a
# hard-coded 8 so adding/removing a category cannot desynchronize the two.
data_train = [[] for _ in ARTICLE_NAME]

train_num = 0  # index of the category currently being sized
train_sum = 0  # total number of training lines across all categories

for n in ARTICLE_NAME:
    # countline() is a project helper (defined elsewhere); presumably it
    # returns the number of lines in the category's article file — confirm.
    data_range = countline('/Users/Soma/Onedrive/News_Dataset/article' + n + '.txt')
    train_sum += data_range
    # Reserve one None slot per article line in a single step instead of
    # appending in a loop.
    data_train[train_num].extend([None] * data_range)
    train_num += 1

# Flat placeholder arrays sized to the full training set.
label_train = [None] * train_sum
article_data = [None] * train_sum

num = 0   # NOTE(review): unused in this chunk — kept for compatibility
num3 = 0  # NOTE(review): unused in this chunk — kept for compatibility

# Token dictionary built elsewhere (gensim corpora.Dictionary text dump).
dictionary = corpora.Dictionary.load_from_text('test_dic4.txt')

# Debug: tokenize a sample Japanese headline with MeCab and pretty-print it.
M.pp(M.isMecab("香川ループ弾「衝撃」と賞賛"))

# Debug scaffolding exploring nested-list behaviour; prints
# [['a', 'b', 'c'], ['d', 'e'], [], [], [], []]
test = [[] for _ in range(6)]
test[0].append("a")
test[0].append("b")
test[0].append("c")
test[1].append("d")
test[1].append("e")
print(test)
# NOTE(review): the opening of this dict was lost in the original paste; the
# head is reconstructed to match the identical ARTICLE mapping earlier in the
# file — confirm against version control.
ARTICLE = {
    "Computer": "",
    "Entertainment": "",
    "Sports": "",
    "Science": "",
    "Economy": "",
    "World": "",
    "Politics": "",
    "Society": "",
}


def cleate_lda_model():
    # NOTE(review): this `def` line was also lost in the paste; reconstructed
    # from the cleate_lda_model() call in the __main__ block below, whose
    # preceding loose statements are exactly this body.  The misspelling
    # ("cleate") is kept because the call site uses it.
    """Train and save one 30-topic LDA model per news category.

    For each category in ARTICLE_NAME: read its article file into ARTICLE,
    then fit gensim LdaModel on the pre-built tf-idf corpus
    news_noun_<name>.mm and save it as model_<name>.lda.
    """
    dictionary = corpora.Dictionary.load_from_text("noun_dic.txt")
    for n in ARTICLE_NAME:
        # Parenthesized single-argument print: valid in both Python 2 and 3,
        # consistent with the print(...) calls elsewhere in this file.
        print("\n" + n + " LDA modl cleate..\n")
        # `with` guarantees the file is closed even if readlines() raises.
        with codecs.open("/Users/Soma/Onedrive/News_Dataset/article" + n + ".txt", "r") as f:
            ARTICLE[n] = f.readlines()
        # NOTE(review): this bag-of-words list is computed but never used —
        # the model below trains on the .mm corpus instead; confirm intent.
        data_train = [dictionary.doc2bow(M.isMecab(j)) for j in ARTICLE[n]]
        tfidf_corpus = gensim.corpora.MmCorpus("news_noun_" + n + ".mm")
        lda = models.LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=30)
        lda.save("model_" + n + ".lda")


if __name__ == "__main__":
    print("cleate_dic..")
    # CD.cleate_dic()
    cleate_lda_model()
    # Reload two of the saved models and dump their topics for inspection.
    lda = models.LdaModel.load("model_Sports.lda")
    lda2 = models.LdaModel.load("model_Computer.lda")
    for n in range(30):
        M.pp(lda.print_topics(n + 1))
    for n in range(30):
        M.pp(lda2.print_topics(n + 1))