예제 #1
0
파일: __init__.py 프로젝트: StevenLOL/detie
def find_doc():
    """Build a word -> doc mapping by running ``doc_extract`` over the corpus.

    The corpus is split into groups with ``corpus.block_groups`` and every
    group is processed in parallel by a pool with one worker per CPU. The
    per-block results are merged into a single dict; for each word the first
    truthy ``doc`` seen is kept.

    Returns:
        dict: the merged mapping of word to doc.
    """
    # Shared state is populated before the pool is created
    # (presumably so that worker processes inherit it -- confirm).
    _global['trie'] = build_trie()
    _global['words_set'] = SetData('output.txt')
    map_ = {}
    corpus = load_corpus()
    cpu_count = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=cpu_count)
    try:
        groups = corpus.block_groups(cpu_count, COUNT_STEP)
        i = 0
        # Denominator for the progress percentage; 9999000 is presumably the
        # approximate corpus size -- TODO confirm.
        sum_ = float(9999000) / COUNT_STEP
        for group in groups:
            new_string_groups = pool.map(doc_extract, group)
            for new_strings in new_string_groups:
                # NOTE: ``iteritems`` is the Python 2 dict API, as in the
                # rest of this module.
                for word, doc in new_strings.iteritems():
                    # Keep the existing entry unless it is missing or falsy.
                    if not map_.get(word):
                        map_[word] = doc
            i += cpu_count
            logger.info("Computing: %.2f%%" % (i / sum_ * 100))
    finally:
        # Always release the worker processes, including on errors.
        # ``pool.map`` is synchronous, so no work is outstanding here.
        pool.terminate()
        pool.join()
    return map_
예제 #2
0
파일: __init__.py 프로젝트: StevenLOL/detie
def count_new_strings():
    """Count how often each new string is produced by ``extract_process``.

    The corpus is split into groups with ``corpus.block_groups`` and every
    group is processed in parallel by a pool with one worker per CPU. Each
    string yielded by a per-block result increments its count by one.

    Returns:
        Counter: number of occurrences per extracted string.
    """
    # The trie is stored before the pool is created
    # (presumably so that worker processes inherit it -- confirm).
    _global['trie'] = build_trie()
    corpus = load_corpus()
    counter = Counter()
    cpu_count = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=cpu_count)
    try:
        logger.info('Create pool of %d processes' % cpu_count)
        i = 0
        # Denominator for the progress percentage; 9999000 is presumably the
        # approximate corpus size -- TODO confirm.
        sum_ = float(9999000) / COUNT_STEP
        groups = corpus.block_groups(cpu_count, COUNT_STEP)
        for group in groups:
            new_string_groups = pool.map(extract_process, group)
            for new_strings in new_string_groups:
                # Explicit loop rather than ``counter.update``: the result
                # type is not visible here, and ``update`` would treat a
                # mapping differently from a plain iterable.
                for str_ in new_strings:
                    counter[str_] += 1
            i += cpu_count
            logger.info("Computing: %.2f%% - [%d]" % (i / sum_ * 100, len(counter)))
    finally:
        # Always release the worker processes, including on errors.
        # ``pool.map`` is synchronous, so no work is outstanding here.
        pool.terminate()
        pool.join()
    logger.info("Computing finished")
    return counter
예제 #3
0
파일: __init__.py 프로젝트: StevenLOL/detie
def load_corpus():
    """Create and return the corpus ``DictData`` for 'COAE2014_task3'.

    The data is opened with the 'gbk' encoding; a log line is written
    before and after construction.
    """
    source = 'COAE2014_task3'
    logger.info("Loading corpus data")
    # Alternative corpus source kept for reference:
    #   NLPIRXMLData("NLPIR_weibo_content_corpus.xml", encoding='gbk')
    corpus = DictData(source, encoding='gbk')
    logger.info("Corpus data loaded")
    return corpus
예제 #4
0
파일: __init__.py 프로젝트: StevenLOL/detie
def build_trie():
    """Construct and return a ``Trie`` from the 'COAE_Known_dict.txt' data.

    A log line is written before and after construction.
    """
    logger.info("Building trie tree")
    tree = Trie(DictData('COAE_Known_dict.txt'))
    logger.info("Trie tree build success")
    return tree