def main():
    logging.basicConfig(format='[%(levelname)s] : %(message)s', level=logging.INFO)

    # Extract the top keywords from the test article and print them
    test_data = getTestData('input/test.txt')
    tags = getSingleKeywords(test_data, 3)
    print("\nKeywords:\n")
    print(" ".join(tags))
    print("\n")

    # Segment the article and render a word cloud from the tokens
    init_stopword()
    seg_list = getSingleSegment(test_data)
    seg_string = " ".join(seg_list)
    wc = generateWordCloud(seg_string)
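# The script above relies on a generateWordCloud() helper that is not shown here.
# Below is a minimal sketch of such a helper, assuming it wraps the third-party
# `wordcloud` package; the font path and output file name are placeholders, and a
# CJK font is needed for the Chinese tokens to render correctly.
from wordcloud import WordCloud

def generate_word_cloud_sketch(seg_string,
                               font_path='fonts/NotoSansTC-Regular.otf',  # placeholder path
                               out_path='output/wordcloud.png'):          # placeholder path
    # `seg_string` is a whitespace-joined string of segmented tokens,
    # as produced by " ".join(getSingleSegment(...)) above.
    wc = WordCloud(font_path=font_path,
                   width=800,
                   height=600,
                   background_color='white').generate(seg_string)
    wc.to_file(out_path)
    return wc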
def main():
    logging.basicConfig(format='[%(levelname)s] : %(message)s', level=logging.INFO)
    init_stopword()

    # Fetch and segment every article in the fb_fetch_article table
    article_lists = getArticle("fb_fetch_article")
    seg_list = getSegment(article_lists)

    # Keep only the documents that mention at least one of the target keywords,
    # remembering each kept document's index in the original article list
    keywords = {
        'FGO', 'fgo', 'Fate', '少女前線', '白貓', '寫真', 'cosplay', 'cos',
        '東方', '演唱會', '百合', '艦娘', '血小板', '碧藍', '音樂', '工作細胞'
    }
    train_list = []
    train_id = []
    for idx, doc in enumerate(seg_list):
        if any(word in keywords for word in doc):
            train_list.append(doc)
            train_id.append(idx)

    # Within each kept document, keep only the keyword tokens themselves
    filt_list = [[y for y in x if y in keywords] for x in train_list]

    with open("output/fb.train_id", "wb") as fp:
        pickle.dump(train_id, fp)

    # Remove tokens that appear fewer than n times across the corpus
    n = 20
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in filt_list:
        for token in text:
            frequency[token] += 1
    # (debug) dump token frequencies:
    # sorted_frequency = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True)
    # for word in sorted_frequency:
    #     if word[1] >= 30:
    #         print(word[0] + ":" + str(word[1]))
    texts = [[token for token in text if frequency[token] >= n] for text in filt_list]

    # Create the dictionary
    dictionary = corpora.Dictionary(texts)
    dictionary.compactify()
    dictionary.save("output/fb.dict")
    logging.info("Create dict success.")

    # Serialize the bag-of-words corpus
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize("output/fb.mm", corpus)
    logging.info("Create data flow success.")

    # TF-IDF model (saved for later use; the LDA below is trained on the raw counts)
    tfidf = models.TfidfModel(corpus)
    tfidf.save("output/fb.tfidf")
    corpus_tfidf = tfidf[corpus]
    logging.info("Create TF-IDF model success.")

    # Train the LDA topic model
    num_topic = 9
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topic,
                          iterations=100, passes=20)
    lda.save("output/fb.lda")
    logging.info("Create LDA model success.")

    for i in range(lda.num_topics):
        print(lda.print_topic(i))
        print('\n')
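# A minimal sketch (not part of the original script) of how the artifacts saved
# above could be reused to infer the topic mixture of a new post. It assumes the
# new text is tokenized with the same getSegment()/jieba pipeline; `new_tokens`
# below is a hypothetical example input.
from gensim import corpora, models

def infer_topics(new_tokens):
    dictionary = corpora.Dictionary.load("output/fb.dict")
    lda = models.LdaModel.load("output/fb.lda")
    bow = dictionary.doc2bow(new_tokens)   # bag-of-words vector for the new post
    return lda.get_document_topics(bow)    # list of (topic_id, probability)

# Example call (hypothetical tokens):
# print(infer_topics(['FGO', '音樂', 'cosplay']))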
def main():
    logging.basicConfig(format='[%(levelname)s] : %(message)s', level=logging.INFO)
    if len(sys.argv) < 2:
        logging.error("No argument")
        logging.info("Usage: $python train_ff.py [activity]")
        logging.info("Supported: FF31, FF32, CWT48, CWT49, PF26, PF27, PF28")
        sys.exit()

    # Date range (quoted for the SQL query) covered by each doujin event
    date_ranges = {
        "FF31": ("\"2017-12-01\"", "\"2018-04-14\""),
        "FF32": ("\"2018-04-15\"", "\"2018-11-11\""),
        "CWT49": ("\"2018-04-01\"", "\"2018-08-20\""),
        "CWT48": ("\"2017-12-15\"", "\"2018-03-31\""),
        "PF26": ("\"2016-11-15\"", "\"2017-04-29\""),
        "PF27": ("\"2017-04-30\"", "\"2018-10-30\""),
        "PF28": ("\"2017-10-31\"", "\"2018-05-28\""),
    }
    if sys.argv[1] not in date_ranges:
        logging.error("Unknown activity: %s", sys.argv[1])
        sys.exit()
    datefrom, dateto = date_ranges[sys.argv[1]]

    init_stopword()
    article_lists = getArticleByTime("fb_fetch_article", datefrom, dateto)
    seg_list = getSegment(article_lists)

    # Keep only the documents that mention at least one of the target keywords,
    # remembering each kept document's index in the original article list
    keywords = {
        'PF', 'FF', 'FGO', 'fgo', 'Fate', '少女前線', '白貓', '寫真', 'cosplay', 'cos',
        '東方', '演唱會', '百合', '艦娘', '血小板', '碧藍', '音樂', '工作細胞'
    }
    train_list = []
    train_id = []
    for idx, doc in enumerate(seg_list):
        if any(word in keywords for word in doc):
            train_list.append(doc)
            train_id.append(idx)

    # Within each kept document, keep only the keyword tokens themselves
    filt_list = [[y for y in x if y in keywords] for x in train_list]

    with open("output/fb_train_id", "wb") as fp:
        pickle.dump(train_id, fp)

    # Remove tokens that appear fewer than n times across the corpus
    n = 20
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in filt_list:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] >= n] for text in filt_list]

    # Create the dictionary
    dictionary = corpora.Dictionary(texts)
    dictionary.compactify()
    dictionary.save("output/fb_" + sys.argv[1] + ".dict")
    logging.info("Create dict success.")

    # Serialize the bag-of-words corpus
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize("output/fb_" + sys.argv[1] + ".mm", corpus)
    logging.info("Create data flow success.")

    tfidf = models.TfidfModel(corpus)
    tfidf.save("output/fb_" + sys.argv[1] + ".tfidf")
    corpus_tfidf = tfidf[corpus]
    logging.info("Create TF-IDF model success.")

    # Train the LDA topic model
    num_topic = 9
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topic,
                          iterations=100, passes=20)
    lda.save("output/fb_" + sys.argv[1] + ".lda")
    logging.info("Create LDA model success.")

    for i in range(lda.num_topics):
        print(lda.print_topic(i))
        print('\n')
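# A sketch (not part of the original script) showing how the pickled fb_train_id
# list and the serialized corpus written above could be combined to report each
# training document's dominant topic. `activity` is the same command-line value
# used when training (e.g. "FF32"); the document order of fb_<activity>.mm matches
# the order of the indices in fb_train_id, since both were written from train_list.
import pickle
from gensim import corpora, models

def dominant_topics(activity):
    lda = models.LdaModel.load("output/fb_" + activity + ".lda")
    corpus = corpora.MmCorpus("output/fb_" + activity + ".mm")
    with open("output/fb_train_id", "rb") as fp:
        train_id = pickle.load(fp)   # indices into the original article list
    for doc_idx, bow in zip(train_id, corpus):
        topics = lda.get_document_topics(bow)
        if topics:
            best = max(topics, key=lambda t: t[1])   # (topic_id, probability) with highest weight
            print("article #%d -> topic %d (p=%.2f)" % (doc_idx, best[0], best[1]))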
def main():
    logging.basicConfig(format='[%(levelname)s] : %(message)s', level=logging.INFO)

    # Load the dictionary and bag-of-words corpus built by the training script
    if os.path.exists("output/0814.dict"):
        dictionary = corpora.Dictionary.load("output/0814.dict")
        corpus = corpora.MmCorpus("output/0814.mm")
        logging.info("Load model success")
    else:
        logging.info("Please run train2.py to create the dict & data flow")
        return  # without the dict/corpus there is nothing to query

    # Create the TF-IDF model
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # Transform into LSI (latent semantic indexing) space
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=40)
    corpus_lsi = lsi[corpus_tfidf]
    lsi.save('output/0814.lsi')
    corpora.MmCorpus.serialize('output/0814_lsi.mm', corpus_lsi)
    # (debug) print the LSI topics:
    # for result in lsi.print_topics(5):
    #     print(result)

    # Segment the query article and project it into LSI space
    init_stopword()
    test_data = getTestData('input/test.txt')
    test_data_seg = getSingleSegment(test_data)
    vec_bow = dictionary.doc2bow(test_data_seg)
    vec_lsi = lsi[vec_bow]
    print("\nArticle:\n%s" % test_data)

    # Build and save the similarity index over the whole corpus
    index = similarities.MatrixSimilarity(lsi[corpus])
    index.save("output/0814.index")

    # Rank all documents by cosine similarity to the query
    sims = index[vec_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    print("result:")
    print(sims[:5])

    # Print the most similar articles
    articles = getArticle()
    for idx in sims[:3]:
        print("\nSimilar Article:\n", articles[idx[0]])
        print("\nSimilarity:", idx[1])
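# A small sketch (not part of the original script) showing that the LSI model and
# similarity index saved above can be reloaded later, so new queries do not have to
# rebuild them; `query_tokens` is a hypothetical, already-segmented input.
from gensim import corpora, models, similarities

def query_saved_index(query_tokens, topn=5):
    dictionary = corpora.Dictionary.load("output/0814.dict")
    lsi = models.LsiModel.load("output/0814.lsi")
    index = similarities.MatrixSimilarity.load("output/0814.index")
    vec_lsi = lsi[dictionary.doc2bow(query_tokens)]   # project the query into LSI space
    sims = sorted(enumerate(index[vec_lsi]), key=lambda item: -item[1])
    return sims[:topn]                                # [(doc_id, similarity), ...]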
def main():
    logging.basicConfig(format='[%(levelname)s] : %(message)s', level=logging.INFO)
    if len(sys.argv) < 2:
        logging.error("No argument")
        logging.info("Usage: $python train_news.py [month]")
        logging.info("Supported: Sep, Oct, Nov, Sep_e, Sep_l, Oct_e, Oct_l, Nov_e")
        sys.exit()

    # Date range (quoted for the SQL query) for each month or half-month
    date_ranges = {
        "Sep": ("\"2018-09-01\"", "\"2018-09-30\""),
        "Oct": ("\"2018-10-01\"", "\"2018-10-30\""),
        "Nov": ("\"2018-11-01\"", "\"2018-11-30\""),
        "Sep_e": ("\"2018-09-01\"", "\"2018-09-15\""),
        "Sep_l": ("\"2018-09-16\"", "\"2018-09-30\""),
        "Oct_e": ("\"2018-10-01\"", "\"2018-10-15\""),
        "Oct_l": ("\"2018-10-16\"", "\"2018-10-30\""),
        "Nov_e": ("\"2018-11-01\"", "\"2018-11-15\""),
    }
    if sys.argv[1] not in date_ranges:
        logging.error("Unknown month: %s", sys.argv[1])
        sys.exit()
    datefrom, dateto = date_ranges[sys.argv[1]]

    init_stopword()
    article_lists = getArticleByTime("media_fetch_news", datefrom, dateto)
    if not article_lists:
        logging.error("No news data in the month, please choose another")
        sys.exit()
    seg_list = getSegment(article_lists)

    # Keep only tokens whose corpus frequency lies strictly between minn and maxn
    minn = 30
    maxn = 200
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in seg_list:
        for token in text:
            frequency[token] += 1
    # (debug) dump token frequencies:
    # sorted_frequency = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True)
    # for word in sorted_frequency:
    #     if word[1] > 30 and word[1] < 200:
    #         print(word[0] + ":" + str(word[1]))
    texts = [[token for token in text
              if frequency[token] > minn and frequency[token] < maxn]
             for text in seg_list]

    dictname = "news_" + sys.argv[1] + ".dict"
    mmname = "news_" + sys.argv[1] + ".mm"
    tfidfname = "news_" + sys.argv[1] + ".tfidf"
    ldaname = "news_" + sys.argv[1] + ".lda"

    # Create the dictionary
    dictionary = corpora.Dictionary(texts)
    dictionary.compactify()
    dictionary.save("output/" + dictname)
    logging.info("Create dict success.")

    # Serialize the bag-of-words corpus
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize("output/" + mmname, corpus)
    logging.info("Create data flow success.")

    tfidf = models.TfidfModel(corpus)
    tfidf.save("output/" + tfidfname)
    corpus_tfidf = tfidf[corpus]
    logging.info("Create TF-IDF model success.")

    # Train the LDA topic model
    num_topic = 6
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topic,
                          iterations=100, passes=20)
    lda.save("output/" + ldaname)
    logging.info("Create LDA model success.")

    for i in range(lda.num_topics):
        print(lda.print_topic(i))
        print('\n')
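# A sketch (not part of the original script) of how gensim's CoherenceModel could
# be used to compare different values of num_topic instead of fixing it at 6. It
# reuses the `texts`, `dictionary`, and `corpus` built above; the candidate topic
# counts are arbitrary example values.
from gensim import models
from gensim.models import CoherenceModel

def pick_num_topics(texts, dictionary, corpus, candidates=(4, 6, 8, 10)):
    scores = {}
    for k in candidates:
        lda = models.LdaModel(corpus, id2word=dictionary, num_topics=k,
                              iterations=100, passes=20)
        cm = CoherenceModel(model=lda, texts=texts, dictionary=dictionary,
                            coherence='c_v')
        scores[k] = cm.get_coherence()   # higher c_v coherence is generally better
    return scores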