Example #1
def test4():
    text = TextExtractor.read_all_text(os.path.join(constants.DATA_PATH, 'author_topic_filtered', '1002.txt')).split(',')
    text2 = TextExtractor.read_all_text(os.path.join(constants.DATA_PATH, 'author_topic_filtered', '1004.txt')).split(',')
    docs = []
    docs.append(text)
    docs.append(text2)
    tf_idf, corpus, dictionary = train_tf_idf(docs)
    plot_word_cloud(tf_idf, corpus, dictionary, 'filtered_100_wordcloud')
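The helpers used in these examples (train_tf_idf, plot_word_cloud, TextExtractor, constants) are defined elsewhere in the project and are not part of this listing. For reference, here is a minimal sketch of what train_tf_idf could look like on top of gensim, assuming it returns the fitted model together with the bag-of-words corpus and the dictionary it was built from; this is an illustration, not the project's actual implementation.

# Hypothetical sketch (assumption, not the project's code): fit a gensim
# tf-idf model over a list of tokenized documents.
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

def train_tf_idf(docs):
    dictionary = Dictionary(docs)                       # map tokens to integer ids
    corpus = [dictionary.doc2bow(doc) for doc in docs]  # bag-of-words vector per document
    tf_idf = TfidfModel(corpus)                         # learn IDF weights from the corpus
    return tf_idf, corpus, dictionary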
Example #2
def test():
    docs = []
    doc1 = TextExtractor.read_all_text(os.path.join(constants.TEST_OUTPUT, "10.txt")).split(",")
    doc2 = TextExtractor.read_all_text(os.path.join(constants.TEST_OUTPUT, "100.txt")).split(",")
    doc3 = TextExtractor.read_all_text(os.path.join(constants.TEST_OUTPUT, "101.txt")).split(",")

    docs.append(doc1)
    docs.append(doc2)
    docs.append(doc3)
    tf_idf, corpus, dictionary = train_tf_idf(docs)
    plot_word_cloud(tf_idf, corpus, dictionary)
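plot_word_cloud is likewise not shown in this listing. A plausible sketch, assuming it sums the tf-idf weight of each word across the corpus and renders the result with the wordcloud package (the default output name is an assumption):

# Hypothetical sketch (assumption, not the project's code): aggregate tf-idf
# weights per word and render them as a word cloud image.
import matplotlib.pyplot as plt
from wordcloud import WordCloud

def plot_word_cloud(tf_idf, corpus, dictionary, name='wordcloud'):
    weights = {}
    for doc in tf_idf[corpus]:                # apply the model to each bag-of-words doc
        for token_id, weight in doc:
            word = dictionary[token_id]
            weights[word] = weights.get(word, 0.0) + weight  # sum weights over documents
    wc = WordCloud(width=800, height=400, background_color='white')
    wc.generate_from_frequencies(weights)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.savefig(name + '.png')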
Example #3
def test2():
    docs = []
    count = 0
    for fileName in os.listdir(os.path.join(constants.DATA_PATH, 'author_topic_filtered')):
        doc = TextExtractor.read_all_text(os.path.join(constants.DATA_PATH, 'author_topic_filtered', fileName)).split(",")
        docs.append(doc)
        if count >= 600:  # stop after roughly 600 documents
            break
        print(count)
        count += 1
    tf_idf, corpus, dictionary = train_tf_idf(docs)
    plot_word_cloud(tf_idf, corpus, dictionary, 'filtered_wordcloud')
Example #4
def test5():
    # text1 = TextExtractor.read_all_text(os.path.join(constants.DATA_PATH, 'authors_topic', '1002.txt'))
    # tokens = Tokenizer.get_tokens(text1)
    # create_word_cloud_from_words(tokens, "Initial Tokens")
    #
    # tokens = Tokenizer.get_stopped_tokens(text1)
    # create_word_cloud_from_words(tokens, "Stop Words Removed")

    # tokens = Tokenizer.get_stemmed_tokens(text1)
    # create_word_cloud_from_words(tokens, "Stemmed Tokens")

    # tokens = TokenFixer.fix_broken_words(tokens)
    # nps = NLPParser.get_noun_phrases(text1)
    # create_word_cloud_from_words(nps, "Noun Phrases")

    tokens0 = TextExtractor.read_all_text(os.path.join(constants.DATA_PATH, 'author_topic_filtered', '1002.txt')).split(',')
    create_word_cloud_from_words(tokens0, "Topics")
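create_word_cloud_from_words is also defined outside this listing. One possible sketch, assuming it builds a cloud directly from raw token counts and displays it under the given title:

# Hypothetical sketch (assumption, not the project's code): plot a word cloud
# from a plain list of tokens.
import matplotlib.pyplot as plt
from wordcloud import WordCloud

def create_word_cloud_from_words(words, title):
    wc = WordCloud(width=800, height=400, background_color='white')
    wc.generate(' '.join(words))   # frequencies come from raw token counts
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()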