Example #1
import gc
import os
import struct

import lmdb
import tqdm

import utility  # project-local helpers

kGB18030 = 'gb18030'  # module-level constant in the original source (assumed value)


def _produce_database_for(paths, env_path):
    # Build the counts in a temporary single-file environment; the final
    # database is produced by a compacting copy at the end.
    env = lmdb.open(env_path + '-tmp',
                    map_size=536870912000,  # generous upper bound for the memory map
                    subdir=False,
                    writemap=True,
                    map_async=True,
                    lock=False)
    total = 0  # total line count across all inputs, for the progress bar
    for p in paths:
        total += utility.read_lines_from(p)
    pbar = tqdm.tqdm(total=total)
    for p in paths:
        with open(p, mode='r', encoding=kGB18030) as f:
            with env.begin(write=True) as t:
                for line in f:
                    pbar.update()
                    try:
                        word, valueStr = line.strip().split('\t')
                        count = int(valueStr)
                    except ValueError:
                        # Skip malformed lines (wrong field count or bad number).
                        continue

                    key = word.encode(encoding=kGB18030)
                    c = t.get(key)
                    if c is not None:
                        # Key already present: accumulate into the stored count.
                        count += struct.unpack('i', c)[0]
                        t.replace(key, struct.pack('i', count))
                    else:
                        t.put(key, struct.pack('i', count), dupdata=False)
    pbar.close()
    env.copy(env_path, compact=True)  # compacting copy produces the final database
    env.close()
    os.remove(env_path + '-tmp')
    gc.collect()
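
Reading the database back requires the same GB18030 key encoding and the same 4-byte struct layout. A minimal read-back sketch, assuming a database produced by the function above (the path 'counts.mdb' is hypothetical):

import struct

import lmdb

# 'counts.mdb' stands in for an env_path passed to _produce_database_for.
env = lmdb.open('counts.mdb', subdir=False, readonly=True, lock=False)
with env.begin() as txn:
    raw = txn.get('你好'.encode('gb18030'))
    if raw is not None:
        print(struct.unpack('i', raw)[0])  # signed 32-bit count, as packed above
env.close()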
Example #2
import datetime
import multiprocessing

import tqdm

import utility  # project-local helpers


def process(process_num: int = 10, mem_limit_gb: int = 10):
    # DATA_TXT_FILE, kEndProcess, deleteMBD, processing_line,
    # tmp_to_database and gen_py_words_json are module-level in the
    # original source.
    PROCESS_NUM = process_num
    MEMORY_LIMIT_GB = mem_limit_gb / PROCESS_NUM  # per-worker memory budget
    utility.load_user_data_jieba()
    print('💭 Counting total lines in the corpus...')
    total_counts = utility.read_lines_from(DATA_TXT_FILE)
    print('''
    🤓 Counting done!
    |--- text lines: {}
    '''.format(total_counts))

    print('👓 Counting transitions...')
    pbar = tqdm.tqdm(total=total_counts)
    deleteMBD()
    jobs = []
    queue = multiprocessing.Queue(10000)
    for _ in range(PROCESS_NUM):
        p = multiprocessing.Process(target=processing_line, args=(queue, ))
        jobs.append(p)
        p.start()

    f = open(DATA_TXT_FILE, encoding='gb18030')
    # Stream the file line by line instead of loading it all at once.
    for line in f:
        pbar.update(1)
        # Feed each line to a worker; queue.put() blocks while the bounded
        # queue is full, so no busy-wait is needed.
        queue.put(line)

    f.close()
    pbar.close()
    # One end-of-stream sentinel per worker, so every subprocess can exit.
    for _ in jobs:
        queue.put(kEndProcess)
    start_time = datetime.datetime.now()
    print('Waiting for subprocesses to exit')
    for p in jobs:
        p.join()
    print('Merging temp files into the database')
    tmp_to_database()
    print('Generating pinyin-to-words JSON file')
    gen_py_words_json()
    print('Total waiting: {:.2f} h'.format(
        (datetime.datetime.now() - start_time).total_seconds() / 3600))
    print('🎉 Done!')
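
processing_line is not shown in this excerpt. A minimal worker skeleton consistent with the queue-and-sentinel protocol above (the body is a hypothetical sketch, not the original implementation):

def processing_line(queue):
    # Hypothetical sketch: consume lines until the sentinel arrives.
    while True:
        line = queue.get()  # blocks until a line (or the sentinel) is available
        if line == kEndProcess:
            break
        # ... tokenize `line` and accumulate n-gram counts here ...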
Example #3
import operator

import tqdm

import utility  # project-local helpers


def gen_py_words_json():
    # kMMKV_DATABASE, kTransition1gram, WORD_FREQ, PY2WORDSFILE, GRAM1FILE
    # and transition_1gram_data are module-level in the original source.
    print('|--- Generating pinyin-to-gram data')
    transition_1gram_data.clear()

    print('|--- Decompressing Uni-Gram')
    path = "{}/{}.txt".format(kMMKV_DATABASE, kTransition1gram)

    pbar = tqdm.tqdm(total=utility.read_lines_from(path))

    with open(path, 'r') as f:
        for line in f:
            pbar.update()
            try:
                k, c = line.strip().split('\t')
            except ValueError:
                # Skip malformed lines.
                continue

            count = int(c)
            transition_1gram_data[k] = count

    pbar.close()

    print('|--- Writing files...')
    gram1data = []
    with open(WORD_FREQ, mode='w', encoding='utf8') as target:
        for word, weight in sorted(transition_1gram_data.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True):
            py = utility.get_pinyin_list(word)
            pys = ''.join(py)
            gram1data.append((word, "'".join(py), weight))
            target.write('{}\t{}\t{}\n'.format(word, "'".join(py), weight))
            if len(py) == 2 and utility.is_pinyin(pys):
                # A two-character word whose joined pinyin is itself a valid
                # syllable gets an extra entry, so it can also be matched as
                # a single syllable.
                gram1data.append((word, pys, weight))
                target.write('{}\t{}\t{}\n'.format(word, pys, weight))

    py2words_data = {}
    for word, py, w in gram1data:
        py2words_data.setdefault(py, []).append(word)

    # Deduplicate word lists per pinyin key.
    for py, words in py2words_data.items():
        py2words_data[py] = list(set(words))

    utility.writejson2file(py2words_data, PY2WORDSFILE)
    utility.writejson2file(transition_1gram_data, GRAM1FILE)
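
The two-character merge case above is easiest to see on a concrete word. A minimal sketch using the pypinyin package (utility.get_pinyin_list presumably wraps something similar; the syllable set here is a toy stand-in for utility.is_pinyin):

from pypinyin import lazy_pinyin

SYLLABLES = {'xi', 'an', 'xian'}  # toy stand-in for a full syllable table

py = lazy_pinyin('西安')  # ['xi', 'an']
print("'".join(py))       # xi'an -> the regular two-syllable entry
joined = ''.join(py)      # 'xian'
if len(py) == 2 and joined in SYLLABLES:
    print(joined)         # xian  -> the extra single-syllable entry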
Example #4
import gc
import os
import struct

import lmdb
import tqdm

import utility  # project-local helpers

kGB18030 = 'gb18030'  # module-level constant in the original source (assumed value)


def _produce_database_in_memory(paths, env_path):
    # Variant of _produce_database_for that accumulates all counts in a
    # Python dict first, then writes the database in a single pass.
    env = lmdb.open(env_path + '-tmp',
                    map_size=536870912000,  # generous upper bound for the memory map
                    subdir=False,
                    writemap=True,
                    map_async=True,
                    lock=False)
    data = {}
    total = 0
    for p in paths:
        total += utility.read_lines_from(p)
    pbar = tqdm.tqdm(total=total)
    for p in paths:
        with open(p, mode='r', encoding=kGB18030) as f:
            for line in f:
                pbar.update()
                try:
                    word, valueStr = line.strip().split('\t')
                except ValueError:
                    # Skip malformed lines.
                    continue
                count = int(valueStr)
                if word in data:
                    data[word] += count
                else:
                    data[word] = count
    pbar.close()
    pbar = tqdm.tqdm(total=len(data))
    # One write transaction for all accumulated counts.
    with env.begin(write=True) as t:
        for word, count in data.items():
            pbar.update()
            key = word.encode(encoding=kGB18030)
            t.put(key, struct.pack("i", count), dupdata=False)
    env.copy(env_path, compact=True)
    env.close()
    os.remove(env_path + '-tmp')
    data.clear()
    del data
    pbar.close()
    gc.collect()
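
Both variants pack counts as signed 32-bit integers ('i'), which caps any single count at 2**31 - 1; switching the format to 'q' would raise the limit to 64 bits at the cost of four extra bytes per entry. A quick demonstration of the boundary:

import struct

print(struct.unpack('i', struct.pack('i', 2**31 - 1))[0])  # 2147483647, the maximum

try:
    struct.pack('i', 2**31)  # one past the signed 32-bit range
except struct.error as e:
    print(e)  # the count no longer fits in a 4-byte slot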
Example #5
import multiprocessing
import os

import tqdm

import utility  # project-local helpers


def gen_data_txt(process_num: int = 10, mem_limit_gb: int = 10):
    # ARTICLE_DIR, kEndProcess, processing_line, remove_tmp_file and
    # sumup_tmp_files are module-level in the original source.
    print('💭 Counting total entries in the corpus...')
    all_files = []
    total_counts = 0
    for root, directories, filenames in os.walk(ARTICLE_DIR):
        for filename in filenames:
            p = os.path.join(root, filename)
            if p.endswith('.txt'):
                n = utility.read_lines_from(p)
                if n == -1:
                    print(p, '⚠️ Wrong encoding!')
                    continue
                all_files.append(p)
                total_counts += n
    print('''
        🤓 Counting done!
        |--- files: {}
        |--- text lines: {}
        '''.format(len(all_files), total_counts))
    remove_tmp_file()
    pbar = tqdm.tqdm(total=total_counts)
    queue = multiprocessing.Queue(10000)
    jobs = []
    for _ in range(process_num):
        p = multiprocessing.Process(target=processing_line,
                                    args=(queue, process_num, mem_limit_gb))
        jobs.append(p)
        p.start()

    for path in all_files:
        # Probe the encoding: try GB18030 first, then fall back to UTF-8.
        f = open(path, encoding='gb18030')
        try:
            f.readline()
        except UnicodeDecodeError:
            f.close()

        if f.closed:
            f = open(path, encoding='utf8')
            try:
                f.readline()
            except UnicodeDecodeError:
                f.close()
        if f.closed:
            print('Wrong encoding of file {}, bypassing...'.format(path))
            continue
        f.seek(0, 0)
        # Stream the file line by line instead of loading it all at once.
        for line in f:
            pbar.update(1)
            # queue.put() blocks while the bounded queue is full,
            # so no busy-wait is needed.
            queue.put(line)
        f.close()

    pbar.close()

    # One end-of-stream sentinel per worker, so every subprocess can exit.
    for _ in jobs:
        queue.put(kEndProcess)
    print('Waiting for subprocesses to exit')
    for p in jobs:
        p.join()
    print('Merging temp files...')
    sumup_tmp_files()
    print('🗃 Corpus preprocessing done!')
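
sumup_tmp_files is not shown here. A minimal merge sketch with collections.Counter, assuming each temp file holds tab-separated word/count lines like the ones consumed by _produce_database_for (the file names are hypothetical):

import collections
import glob

merged = collections.Counter()
for tmp_path in glob.glob('tmp-*.txt'):  # hypothetical temp-file naming
    with open(tmp_path, encoding='gb18030') as f:
        for line in f:
            try:
                word, count = line.strip().split('\t')
            except ValueError:
                continue
            merged[word] += int(count)

with open('data.txt', 'w', encoding='gb18030') as out:  # hypothetical output name
    for word, count in merged.items():
        out.write('{}\t{}\n'.format(word, count))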