def test4():
    # Build a TF-IDF model from two filtered author-topic documents and plot a word cloud.
    text = TextExtractor.read_all_text(os.path.join(constants.DATA_PATH, 'author_topic_filtered', '1002.txt')).split(',')
    text2 = TextExtractor.read_all_text(os.path.join(constants.DATA_PATH, 'author_topic_filtered', '1004.txt')).split(',')
    docs = []
    docs.append(text)
    docs.append(text2)
    tf_idf, corpus, dictionary = train_tf_idf(docs)
    plot_word_cloud(tf_idf, corpus, dictionary, 'filtered_100_wordcloud')
def test():
    # Build a TF-IDF model from three extracted test documents and plot a word cloud.
    docs = []
    doc1 = TextExtractor.read_all_text(os.path.join(constants.TEST_OUTPUT, "10.txt")).split(",")
    doc2 = TextExtractor.read_all_text(os.path.join(constants.TEST_OUTPUT, "100.txt")).split(",")
    doc3 = TextExtractor.read_all_text(os.path.join(constants.TEST_OUTPUT, "101.txt")).split(",")
    docs.append(doc1)
    docs.append(doc2)
    docs.append(doc3)
    tf_idf, corpus, dictionary = train_tf_idf(docs)
    plot_word_cloud(tf_idf, corpus, dictionary)
def test2():
    # Build a TF-IDF model from the filtered author-topic documents (stops after 601 files) and plot a word cloud.
    docs = []
    count = 0
    for fileName in os.listdir(os.path.join(constants.DATA_PATH, 'author_topic_filtered')):
        if count >= 0:
            doc = TextExtractor.read_all_text(os.path.join(constants.DATA_PATH, 'author_topic_filtered', fileName)).split(",")
            docs.append(doc)
        if count >= 600:
            break
        print(count)
        count += 1
    tf_idf, corpus, dictionary = train_tf_idf(docs)
    plot_word_cloud(tf_idf, corpus, dictionary, 'filtered_wordcloud')
def test5():
    # Earlier experiments on the raw text (tokenization, stop-word removal, stemming,
    # noun-phrase extraction), kept commented out for reference:
    # text1 = TextExtractor.read_all_text(os.path.join(constants.DATA_PATH, 'authors_topic', '1002.txt'))
    # tokens = Tokenizer.get_tokens(text1)
    # create_word_cloud_from_words(tokens, "Initial Tokens")
    #
    # tokens = Tokenizer.get_stopped_tokens(text1)
    # create_word_cloud_from_words(tokens, "Stop Words Removed")
    # tokens = Tokenizer.get_stemmed_tokens(text1)
    # create_word_cloud_from_words(tokens, "Stemmed Tokens")
    # tokens = TokenFixer.fix_broken_words(tokens)
    # nps = NLPParser.get_noun_phrases(text1)
    # create_word_cloud_from_words(nps, "Noun Phrases")

    # Current behavior: plot a word cloud directly from the pre-filtered topic tokens.
    tokens0 = TextExtractor.read_all_text(os.path.join(constants.DATA_PATH, 'author_topic_filtered', '1002.txt')).split(',')
    create_word_cloud_from_words(tokens0, "Topics")
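# A minimal entry point, added as a sketch: it assumes this module is run directly as a
# script and simply invokes one of the test functions above; swap in whichever test you
# want to run (the original file did not specify an entry point).
if __name__ == '__main__':
    test()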