def _produce_database_for(paths, env_path):
    # Write into a temporary, single-file LMDB environment first, then compact it
    # into the final database file.
    env = lmdb.open(env_path + '-tmp',
                    536870912000,
                    subdir=False,
                    writemap=True,
                    map_async=True,
                    lock=False)
    total = 0
    for p in paths:
        total += utility.read_lines_from(p)
    pbar = tqdm.tqdm(total=total)
    for p in paths:
        with open(p, mode='r', encoding=kGB18030) as f:
            with env.begin(write=True) as t:
                for line in f:
                    pbar.update()
                    try:
                        word, valueStr = line.strip().split('\t')
                        count = int(valueStr)
                    except:
                        continue
                    key = word.encode(encoding=kGB18030)
                    c = t.get(key)
                    if c is not None:
                        # Key already present: accumulate the stored count.
                        count += struct.unpack('i', c)[0]
                        t.replace(key, struct.pack("i", count))
                    else:
                        t.put(key, struct.pack("i", count), dupdata=False)
    pbar.close()
    env.copy(env_path, compact=True)
    env.close()
    os.remove(env_path + '-tmp')
    gc.collect()
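
# A minimal read-back sketch, not part of the original pipeline: it assumes the
# database produced above (subdir=False layout, counts packed with
# struct.pack("i", ...)) and that kGB18030 is the module's 'gb18030' codec
# constant. The helper name _lookup_count is hypothetical.
def _lookup_count(env_path, word):
    env = lmdb.open(env_path, subdir=False, readonly=True, lock=False)
    with env.begin() as t:
        raw = t.get(word.encode(encoding=kGB18030))
    env.close()
    return struct.unpack('i', raw)[0] if raw is not None else 0
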
def process(process_num: int = 10, mem_limit_gb: int = 10):
    PROCESS_NUM = process_num
    # Per-process memory budget.
    MEMORY_LIMIT_GB = mem_limit_gb / PROCESS_NUM
    utility.load_user_data_jieba()
    print('💭Counting total lines in the corpus...')
    total_counts = utility.read_lines_from(DATA_TXT_FILE)
    print('''
    🤓 Counting done!
    |--- Text lines: {}
    '''.format(total_counts))
    print('👓 Collecting transition statistics...')
    pbar = tqdm.tqdm(total=total_counts)
    deleteMBD()
    jobs = []
    queue = multiprocessing.Queue(10000)
    for _ in range(0, PROCESS_NUM):
        p = multiprocessing.Process(target=processing_line, args=(queue, ))
        jobs.append(p)
        p.start()
    f = open(DATA_TXT_FILE, encoding='gb18030')
    # Stream the corpus line by line instead of loading the whole file at once.
    for line in f:
        pbar.update(1)
        # Hand each line to the worker processes; spin while the queue is full.
        while queue.full():
            pass
        queue.put(line)
    f.close()
    pbar.close()
    while queue.full():
        pass
    queue.put(kEndProcess)
    start_time = datetime.datetime.now()
    print('Waiting for subprocesses to exit')
    for p in jobs:
        while p.is_alive():
            pass
    print('Merging temp files into the database')
    tmp_to_database()
    print('Generating pinyin-to-words JSON file')
    gen_py_words_json()
    print(
        'Total waiting: {:.2f}'.format(
            (datetime.datetime.now() - start_time).total_seconds() / 60 / 60),
        'h')
    print('🎉️Done!')
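
# A hedged alternative to the busy-wait above (sketch only; _feed_lines is not
# part of the original module): multiprocessing.Queue.put() already blocks when
# the queue is full, so the producer can rely on back-pressure instead of
# spinning on queue.full().
def _feed_lines(queue, path, encoding='gb18030'):
    with open(path, encoding=encoding) as f:
        for line in f:
            queue.put(line)  # blocks until a worker frees a slot
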
def gen_py_words_json():
    print('|--- Generating pinyin-to-gram data')
    transition_1gram_data.clear()
    print('|--- Unpacking uni-gram')
    path = "{}/{}.txt".format(kMMKV_DATABASE, kTransition1gram)
    pbar = tqdm.tqdm(total=utility.read_lines_from(path))
    with open(path, 'r') as f:
        for line in f:
            pbar.update()
            try:
                k, c = line.strip().split('\t')
            except:
                continue
            count = int(c)
            transition_1gram_data[k] = count
    pbar.close()
    print('|--- Writing files...')
    target = open(WORD_FREQ, mode='w', encoding='utf8')
    gram1data = []
    for word, weight in sorted(transition_1gram_data.items(),
                               key=operator.itemgetter(1),
                               reverse=True):
        py = utility.get_pinyin_list(word)
        pys = ''.join(py)
        gram1data.append((word, "'".join(py), weight))
        target.write('{}\t{}\t{}\n'.format(word, "'".join(py), weight))
        if len(py) == 2 and utility.is_pinyin(pys):
            # If the word has two characters but their concatenated pinyin is
            # itself a valid syllable, add an extra entry so it can also be
            # matched as a single-syllable input.
            gram1data.append((word, pys, weight))
            target.write('{}\t{}\t{}\n'.format(word, pys, weight))
    target.close()
    py2words_data = {}
    for word, py, w in gram1data:
        py2words_data.setdefault(py, []).append(word)
    for py, words in py2words_data.items():
        py2words_data[py] = list(set(words))
    utility.writejson2file(py2words_data, PY2WORDSFILE)
    utility.writejson2file(transition_1gram_data, GRAM1FILE)
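
# Illustrative lookup against the pinyin-to-words JSON written above. The json
# import and the candidates() helper are assumptions for demonstration, not part
# of the original file; keys follow the format produced by gen_py_words_json(),
# i.e. syllables joined by "'" (e.g. "ni'hao"), plus the merged form for
# two-character words whose concatenated pinyin is itself a valid syllable.
import json

def candidates(pinyin_key, py2words_path=PY2WORDSFILE):
    with open(py2words_path, encoding='utf8') as f:
        py2words = json.load(f)
    return py2words.get(pinyin_key, [])
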
def _produce_database_in_memory(paths, env_path):
    env = lmdb.open(env_path + '-tmp',
                    536870912000,
                    subdir=False,
                    writemap=True,
                    map_async=True,
                    lock=False)
    data = {}
    total = 0
    for p in paths:
        total += utility.read_lines_from(p)
    pbar = tqdm.tqdm(total=total)
    # First pass: accumulate all counts in a Python dict.
    for p in paths:
        with open(p, mode='r', encoding=kGB18030) as f:
            for line in f:
                pbar.update()
                try:
                    word, valueStr = line.strip().split('\t')
                    count = int(valueStr)
                except:
                    continue
                if word in data:
                    data[word] += count
                else:
                    data[word] = count
    pbar.close()
    # Second pass: flush the merged counts into LMDB in a single transaction.
    pbar = tqdm.tqdm(total=len(data))
    with env.begin(write=True) as t:
        for word, count in data.items():
            pbar.update()
            key = word.encode(encoding=kGB18030)
            t.put(key, struct.pack("i", count), dupdata=False)
    env.copy(env_path, compact=True)
    env.close()
    os.remove(env_path + '-tmp')
    data.clear()
    del data
    pbar.close()
    gc.collect()
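
# A possible dispatcher between the two variants above (sketch; the helper name
# _produce_database and the size heuristic are assumptions, not part of the
# original module): keep the merge in a Python dict while the combined input
# size fits the memory budget, otherwise stream straight into the LMDB
# transaction.
def _produce_database(paths, env_path, mem_limit_gb=10):
    total_bytes = sum(os.path.getsize(p) for p in paths)
    if total_bytes < mem_limit_gb * (1 << 30):
        _produce_database_in_memory(paths, env_path)
    else:
        _produce_database_for(paths, env_path)
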
def gen_data_txt(process_num: int = 10, mem_limit_gb: int = 10):
    print('💭Counting total entries in the source data...')
    all_files = []
    total_counts = 0
    for root, directories, filenames in os.walk(ARTICLE_DIR):
        for filename in filenames:
            p = os.path.join(root, filename)
            if p.endswith('.txt'):
                n = utility.read_lines_from(p)
                if n == -1:
                    print(p, '⚠️ Wrong encoding!')
                    continue
                all_files.append(p)
                total_counts += n
    print('''
    🤓 Counting done!
    |--- Files: {}
    |--- Text lines: {}
    '''.format(len(all_files), total_counts))
    remove_tmp_file()
    pbar = tqdm.tqdm(total=total_counts)
    queue = multiprocessing.Queue(10000)
    jobs = []
    for _ in range(0, process_num):
        p = multiprocessing.Process(target=processing_line,
                                    args=(queue, process_num, mem_limit_gb))
        jobs.append(p)
        p.start()
    for path in all_files:
        # Probe the encoding: try GB18030 first, then fall back to UTF-8.
        f = open(path, encoding='gb18030')
        try:
            line = f.readline()
        except:
            f.close()
        if f.closed:
            f = open(path, encoding='utf8')
            try:
                line = f.readline()
            except:
                f.close()
        if f.closed:
            print('Wrong encoding of file {}, bypassing...'.format(path))
            continue
        del line
        f.seek(0, 0)
        # Stream the file line by line instead of loading it all at once.
        for line in f:
            pbar.update(1)
            # Hand each line to the worker processes; spin while the queue is full.
            while queue.full():
                pass
            queue.put(line)
        f.close()
    pbar.close()
    queue.put(kEndProcess)
    print('Waiting for subprocesses to exit')
    for p in jobs:
        while p.is_alive():
            pass
    print('Merging temp files...')
    sumup_tmp_files()
    print('🗃 Corpus preprocessing done!')
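
# Hypothetical end-to-end driver (the __main__ guard is not in the original
# module): first normalise the raw articles into DATA_TXT_FILE via gen_data_txt(),
# then build the transition statistics and the pinyin-to-words JSON via process().
if __name__ == '__main__':
    gen_data_txt(process_num=10, mem_limit_gb=10)
    process(process_num=10, mem_limit_gb=10)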