def find_doc():
    """Map each extracted word to the first document it was found in.

    Fans the corpus out block-by-block to a multiprocessing pool running
    ``doc_extract`` and collects the per-block {word: doc} results,
    keeping only the first doc seen for each word.

    Returns:
        dict: word -> doc of first occurrence.
    """
    _global['trie'] = build_trie()
    _global['words_set'] = SetData('output.txt')
    map_ = {}
    corpus = load_corpus()
    cpu_count = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=cpu_count)
    groups = corpus.block_groups(cpu_count, COUNT_STEP)
    i = 0
    # 9999000 is presumably the (approximate) corpus size in records —
    # TODO confirm; sum_ is the expected number of blocks, used only for
    # the progress percentage below.
    sum_ = float(9999000) / COUNT_STEP
    try:
        for group in groups:
            new_string_groups = pool.map(doc_extract, group)
            for new_strings in new_string_groups:
                for word, doc in new_strings.iteritems():
                    # BUG FIX: `if not map_.get(word)` also overwrote
                    # entries whose stored doc was falsy ('' / None / 0);
                    # a membership test keeps the first doc unconditionally.
                    if word not in map_:
                        map_[word] = doc
            i += cpu_count
            logger.info("Computing: %.2f%%" % (i / sum_ * 100))
    finally:
        # BUG FIX: the pool was never shut down, leaking worker processes
        # on every call.
        pool.close()
        pool.join()
    return map_
def count_new_strings():
    """Count frequencies of new strings extracted from the whole corpus.

    Distributes corpus blocks across a multiprocessing pool running
    ``extract_process`` and accumulates every extracted string into a
    Counter, logging progress as blocks complete.

    Returns:
        collections.Counter: string -> occurrence count.
    """
    _global['trie'] = build_trie()
    corpus = load_corpus()
    counter = Counter()
    cpu_count = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=cpu_count)
    logger.info('Create pool of %d processes' % cpu_count)
    i = 0
    # 9999000 is presumably the (approximate) corpus size in records —
    # TODO confirm; sum_ only feeds the progress percentage.
    sum_ = float(9999000) / COUNT_STEP
    groups = corpus.block_groups(cpu_count, COUNT_STEP)
    try:
        for group in groups:
            new_string_groups = pool.map(extract_process, group)
            for new_strings in new_string_groups:
                for str_ in new_strings:
                    counter[str_] += 1
            i += cpu_count
            logger.info("Computing: %.2f%% - [%d]" % (i / sum_ * 100, len(counter)))
    finally:
        # BUG FIX: the pool was never shut down, leaking worker processes
        # on every call.
        pool.close()
        pool.join()
    logger.info("Computing finished")
    return counter
def load_corpus():
    """Load and return the COAE2014 task-3 corpus (GBK encoded)."""
    logger.info("Loading corpus data")
    # Alternative XML-backed corpus, kept for reference:
    # corpus_data = NLPIRXMLData("NLPIR_weibo_content_corpus.xml", encoding='gbk')
    data = DictData('COAE2014_task3', encoding='gbk')
    logger.info("Corpus data loaded")
    return data
def build_trie():
    """Build and return a Trie from the known-words dictionary file."""
    logger.info("Building trie tree")
    result = Trie(DictData('COAE_Known_dict.txt'))
    logger.info("Trie tree build success")
    return result