def define_dictionary_n_grams(docs, start=1):
    # NOTE: `docs` and `start` are currently unused; the corpus is read from the
    # hard-coded `file_name` below.
    print("define_dictionary_n_grams")

    import topmine_src.phrase_mining as phrase_mining
    import topmine_src.utils as utils

    file_name = "vw_remont-i-stroitel_stvo_only_text"
    output_path = "remont_n-grams"

    # Minimum number of occurrences required for each phrase.
    min_support = 10

    # Threshold for merging two words into a phrase. A lower alpha leads to
    # higher recall and lower precision.
    alpha = 4

    # Maximum phrase length.
    max_phrase_size = 10

    phrase_miner = phrase_mining.PhraseMining(
        file_name, min_support, max_phrase_size, alpha)
    partitioned_docs, index_vocab = phrase_miner.mine()
    frequent_phrases = phrase_miner.get_frequent_phrases(min_support)

    utils.store_partitioned_docs(partitioned_docs)
    utils.store_vocab(index_vocab)
    utils.store_frequent_phrases(frequent_phrases, output_path)

    return {}
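# A minimal usage sketch for define_dictionary_n_grams (hypothetical call; the
# `docs` argument is accepted but ignored by the current implementation, since
# the corpus path is hard-coded inside the function):
#
#     define_dictionary_n_grams(docs=None)
#
# The partitioned documents and vocabulary are stored via topmine_src.utils'
# default paths, and the frequent phrases are written under the
# "remont_n-grams" output path.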
from topmine_src import phrase_mining
from topmine_src import utils
import sys

arguments = sys.argv

print('Running Phrase Mining...')

file_name = arguments[1]

# Minimum number of occurrences required for each phrase.
min_support = 10

# Threshold for merging two words into a phrase. A lower alpha leads to
# higher recall and lower precision.
alpha = 4

# Maximum phrase length.
max_phrase_size = 10

phrase_miner = phrase_mining.PhraseMining(
    file_name, min_support, max_phrase_size, alpha)
partitioned_docs, index_vocab = phrase_miner.mine()
frequent_phrases = phrase_miner.get_frequent_phrases(min_support)

utils.store_partitioned_docs(partitioned_docs)
utils.store_vocab(index_vocab)
utils.store_frequent_phrases(frequent_phrases)
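# Example invocation, assuming the snippet above is saved as a standalone
# script (the script name below is hypothetical):
#
#     python run_phrase_mining.py corpus.txt
#
# corpus.txt is the input corpus passed as sys.argv[1]; the partitioned
# documents, vocabulary, and frequent phrases are then written out through
# topmine_src.utils' store_* helpers.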
# Assumes module-level imports of json, tqdm, the topmine_src phrase_mining and
# utils modules, and an absl/tf-style FLAGS object defined elsewhere in the script.
def main(_):
    stop_word_file = FLAGS.stop_word_file
    file_name = FLAGS.train_file
    min_support = FLAGS.min_support
    max_phrase_size = FLAGS.max_phrase_size
    alpha = FLAGS.alpha
    beta = FLAGS.beta
    iteration = FLAGS.iteration
    num_topics = FLAGS.num_topics
    optimization_iterations = FLAGS.optimization_iterations
    optimization_burnin = FLAGS.optimization_burnin

    import jieba  # only used by the commented-out preprocessing below

    # with open(FLAGS.train_file, "r") as frobj:
    #     examples = [line.strip() for line in frobj]
    #     print(len(examples), "===before removing duplicate===")
    #     examples = set(examples)
    #     tmp = []
    #     for example in examples:
    #         re_pattern = "({}{})".format("__label__", "\d.")
    #         element_list = re.split(re_pattern, example)
    #         tmp.append(" ".join(list(jieba.cut("".join(element_list[-1].split())))))
    #     examples = set(tmp)
    #     print(len(examples), "===after removing duplicate===")

    # Each training file contains one JSON object per line with a "text" field;
    # documents of 512 or more characters are skipped.
    train_file_list = FLAGS.train_file.split("&")
    examples = []
    for train_file in train_file_list:
        with open(train_file, "r") as frobj:
            for line in tqdm(frobj):
                try:
                    content = json.loads(line)
                    text = " ".join(content["text"])
                    if len(text) >= 512:
                        continue
                    examples.append(text)
                except:
                    # Skip lines that fail to parse.
                    continue

    def _get_stopwords(stop_word_path):
        """Returns the set of stopwords read from `stop_word_path`."""
        stopwords = set()
        with open(stop_word_path, "r") as frobj:
            for line in frobj:
                stopwords.add(line.rstrip())
        return stopwords

    # stopwords = _get_stopwords(FLAGS.stop_word_file)
    stopwords = []

    phrase_miner = phrase_mining.PhraseMining(min_support, max_phrase_size, alpha)
    partitioned_docs, index_vocab, partitioned_indexer = phrase_miner.mine(
        examples, stopwords)
    frequent_phrases = phrase_miner.get_frequent_phrases(
        min_support, if_only_phrase=False)

    partioned_docs_path = FLAGS.ouput_file + "/partioned_docs.txt"
    utils.store_partitioned_docs(partitioned_docs, path=partioned_docs_path)

    vocab_path = FLAGS.ouput_file + "/vocabs.txt"
    utils.store_vocab(index_vocab, path=vocab_path)

    frequent_phrase_path = FLAGS.ouput_file + "/frequent_phrases.txt"
    utils.store_frequent_phrases(frequent_phrases, path=frequent_phrase_path)

    print("{}: total frequent phrases {}".format(file_name, len(frequent_phrases)))

    # print('Running PhraseLDA...')
    # partitioned_docs = utils.load_partitioned_docs(path=partioned_docs_path)
    # vocab_file = utils.load_vocab(path=vocab_path)
    # plda = phrase_lda.PhraseLDA(partitioned_docs, vocab_file, num_topics,
    #                             alpha, beta, iteration,
    #                             optimization_iterations, optimization_burnin)
    # document_phrase_topics, most_frequent_topics, topics = plda.run()
    # stored_topics_path = FLAGS.ouput_file + "/doc_phrase_topics.txt"
    # utils.store_phrase_topics(document_phrase_topics, path=stored_topics_path)
    # most_frequent_topic_prefix_path = FLAGS.ouput_file + "/frequent_phrase_topics.txt"
    # utils.store_most_frequent_topics(most_frequent_topics,
    #                                  prefix_path=most_frequent_topic_prefix_path)

    import _pickle as pkl
    pkl.dump(
        {
            "frequent_phrases": frequent_phrases,
            "index_vocab": index_vocab,
            "partitioned_docs": partitioned_docs,
            "indexer": partitioned_indexer
        },
        open(FLAGS.ouput_file + "/mining_info.pkl", "wb"))
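# A minimal sketch (not part of the original script) of reading the stored text
# outputs back, using the same topmine_src.utils loaders referenced in the
# commented-out PhraseLDA block above. `output_dir` is a hypothetical argument
# standing in for FLAGS.ouput_file.
def _load_mining_outputs(output_dir):
    partitioned_docs = utils.load_partitioned_docs(path=output_dir + "/partioned_docs.txt")
    index_vocab = utils.load_vocab(path=output_dir + "/vocabs.txt")
    return partitioned_docs, index_vocab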
# Assumes module-level imports of jieba, tqdm, the topmine_src phrase_mining and
# utils modules, a text-normalization helper `clean`, and an absl/tf-style FLAGS
# object, all defined elsewhere in the script.
def main(_):
    stop_word_file = FLAGS.stop_word_file
    file_name = FLAGS.train_file
    min_support = FLAGS.min_support
    max_phrase_size = FLAGS.max_phrase_size
    alpha = FLAGS.alpha
    beta = FLAGS.beta
    iteration = FLAGS.iteration
    num_topics = FLAGS.num_topics
    optimization_iterations = FLAGS.optimization_iterations
    optimization_burnin = FLAGS.optimization_burnin

    # Each training file is plain text with one document per line; every line is
    # cleaned and segmented with jieba before mining.
    train_file_list = FLAGS.train_file.split("&")
    examples = []
    for train_file in train_file_list:
        with open(train_file, "r") as frobj:
            for line in tqdm(frobj):
                try:
                    content = line.strip()
                    # examples.append(" ".join(list(content)))
                    examples.append(" ".join(list(jieba.cut(clean(content)))))
                except:
                    # Skip lines that fail to clean or segment.
                    continue

    def _get_stopwords(stop_word_path):
        """Returns the set of stopwords read from `stop_word_path`."""
        stopwords = set()
        with open(stop_word_path, "r") as frobj:
            for line in frobj:
                stopwords.add(line.rstrip())
        return stopwords

    # stopwords = []
    stopwords = _get_stopwords(FLAGS.stop_word_file)

    print("==total example==", len(examples))

    phrase_miner = phrase_mining.PhraseMining(min_support, max_phrase_size, alpha)
    partitioned_docs, index_vocab, partitioned_indexer = phrase_miner.mine(
        examples, stopwords)
    frequent_phrases = phrase_miner.get_frequent_phrases(
        min_support, if_only_phrase=False)

    partioned_docs_path = FLAGS.ouput_file + "/partioned_docs.txt"
    utils.store_partitioned_docs(partitioned_docs, path=partioned_docs_path)

    vocab_path = FLAGS.ouput_file + "/vocabs.txt"
    utils.store_vocab(index_vocab, path=vocab_path)

    frequent_phrase_path = FLAGS.ouput_file + "/frequent_phrases.txt"
    utils.store_frequent_phrases(frequent_phrases, path=frequent_phrase_path)

    print("{}: total frequent phrases {}".format(file_name, len(frequent_phrases)))

    # print('Running PhraseLDA...')
    # partitioned_docs = utils.load_partitioned_docs(path=partioned_docs_path)
    # vocab_file = utils.load_vocab(path=vocab_path)
    # plda = phrase_lda.PhraseLDA(partitioned_docs, vocab_file, num_topics,
    #                             alpha, beta, iteration,
    #                             optimization_iterations, optimization_burnin)
    # document_phrase_topics, most_frequent_topics, topics = plda.run()
    # stored_topics_path = FLAGS.ouput_file + "/doc_phrase_topics.txt"
    # utils.store_phrase_topics(document_phrase_topics, path=stored_topics_path)
    # most_frequent_topic_prefix_path = FLAGS.ouput_file + "/frequent_phrase_topics.txt"
    # utils.store_most_frequent_topics(most_frequent_topics,
    #                                  prefix_path=most_frequent_topic_prefix_path)

    import _pickle as pkl
    pkl.dump(
        {
            "frequent_phrases": frequent_phrases,
            "index_vocab": index_vocab,
            "partitioned_docs": partitioned_docs,
            "indexer": partitioned_indexer
        },
        open(FLAGS.ouput_file + "/mining_info.pkl", "wb"))
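# A minimal sketch (not part of the original script) showing how the pickled
# mining results written by main() could be inspected later. Only the standard
# pickle module is used; the dictionary keys match the pkl.dump call above, and
# `output_dir` is a hypothetical stand-in for FLAGS.ouput_file.
import pickle

def _inspect_mining_info(output_dir, top_k=20):
    with open(output_dir + "/mining_info.pkl", "rb") as frobj:
        mining_info = pickle.load(frobj)
    frequent_phrases = mining_info["frequent_phrases"]
    print("total frequent phrases:", len(frequent_phrases))
    # The exact element format depends on get_frequent_phrases; print a small sample.
    for phrase in list(frequent_phrases)[:top_k]:
        print(phrase)
    return mining_info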