예제 #1
0
import phrase_mining
import sys
import utils

# Script entry point: runs phrase mining on the file named by the first
# command-line argument and stores the results through the utils helpers.
arguments = sys.argv

# Fail with a readable usage message instead of an IndexError when the input
# file argument is missing.
if len(arguments) < 2:
    sys.exit('Usage: python {} <input_file>'.format(
        arguments[0] if arguments else '<script>'))

# The parenthesised form prints the same text under Python 2 and is also valid
# syntax under Python 3 (the bare print statement is not).
print('Running Phrase Mining...')

# Path of the input file handed to the phrase miner.
file_name = arguments[1]

# Represents the minimum number of occurrences you want each phrase to have.
min_support = 4

# Represents the threshold for merging two words into a phrase. A lower value
# of alpha leads to higher recall and lower precision.
alpha = 4

# Length of the maximum phrase size.
max_phrase_size = 10

phrase_miner = phrase_mining.PhraseMining(file_name, min_support,
                                          max_phrase_size, alpha)

# mine() returns the partitioned documents together with the index vocabulary;
# the frequent phrases are fetched separately with the same support threshold.
partitioned_docs, index_vocab = phrase_miner.mine()
frequent_phrases = phrase_miner.get_frequent_phrases(min_support)

# Persist all three results (destinations are whatever utils uses by default).
utils.store_partitioned_docs(partitioned_docs)
utils.store_vocab(index_vocab)
utils.store_frequent_phrases(frequent_phrases)
예제 #2
0
]
# Build the phrase miner. A stop-words file is passed as an extra positional
# argument only when `arguments` has more than six entries; arguments[6] is
# then used as an integer index into stop_word_files.
optional_args = ()
if len(arguments) > 6:
    stopwords_file = stop_word_files[int(arguments[6])]
    optional_args = (stopwords_file,)
phrase_miner = og_phrase_mining.PhraseMining(
    file_name, min_support, max_phrase_size, alpha, *optional_args)

# Run the miner, then fetch the frequent phrases using the same threshold.
partitioned_docs, index_vocab = phrase_miner.mine()
frequent_phrases = phrase_miner.get_frequent_phrases(min_support)

# Every output file is named after the last '/'-separated piece of file_name.
input_name = file_name.split('/')[-1]

partitioned_docs_path = (
    "src/topmine/{}/intermediate_output/{}.partitioneddocs.txt".format(
        folder, input_name))
utils.store_partitioned_docs(partitioned_docs, path=partitioned_docs_path)

vocab_path = "src/topmine/{}/intermediate_output/{}.vocab.txt".format(
    folder, input_name)
utils.store_vocab(index_vocab, path=vocab_path)

frequent_phrases_path = (
    'src/topmine/{}/output/{}.frequent_phrases.txt'.format(
        folder, input_name))
utils.store_frequent_phrases(frequent_phrases, path=frequent_phrases_path)

# Report how many frequent phrases were returned.
print(len(frequent_phrases))