def process(self, corpus, record_name, workers=8, max_queue_size=2000):
    """The main processing function (this is the one you call).
    Reads the input corpus (corpus), converts it to TFRecord format and writes
    the corresponding file (record_name).
    Multiprocessing is built in; if you have many CPU cores, increase workers
    and max_queue_size.
    """
    # Create a TFRecordWriter; this object (writer) is responsible for writing
    # records into the target TFRecords file.
    writer = tf.io.TFRecordWriter(record_name)
    globals()['count'] = 0

    def write_to_tfrecord(serialized_instances):
        globals()['count'] += len(serialized_instances)
        for serialized_instance in serialized_instances:
            writer.write(serialized_instance)  # write to the file

    def paragraph_process(texts):
        instances = self.paragraph_process(texts)  # paragraph-level processing
        serialized_instances = self.tfrecord_serialize(instances)  # serialize to TFRecord strings
        return serialized_instances

    # Multi-process / multi-threaded processing
    parallel_apply(
        func=paragraph_process,
        iterable=corpus,
        workers=workers,
        max_queue_size=max_queue_size,
        callback=write_to_tfrecord,
    )

    writer.close()  # close the writer
    print('write %s examples into %s' % (globals()['count'], record_name))
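# Note: these snippets drive their work through parallel_apply from
# bert4keras.snippets, which maps `func` over `iterable` with `workers` processes
# and feeds each result to `callback` in the main process (or returns the list of
# results when no callback is given). A minimal, self-contained sketch of that
# call pattern; the squaring function and the inputs are made up for illustration:
from bert4keras.snippets import parallel_apply

def square(x):
    return x * x

collected = []
parallel_apply(
    func=square,                # runs in the worker processes
    iterable=range(10),         # any iterable of inputs
    workers=2,
    max_queue_size=100,
    callback=collected.append,  # runs in the main process as results arrive
)
print(sorted(collected))  # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]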
def process(self, corpus, record_name, workers=8, max_queue_size=2000):
    """Reads the input corpus (corpus) and converts it to TFRecord format (record_name).
    Multiprocessing is built in; if you have many CPU cores, increase workers and max_queue_size.
    """
    writer = tf.io.TFRecordWriter(record_name)
    globals()['count'] = 0

    def write_to_tfrecord(results):
        globals()['count'] += len(results)
        for tf_serialized in results:
            writer.write(tf_serialized)

    def paragraph_process(texts):
        results = self.paragraph_process(texts)
        results = self.tfrecord_serialize(results)
        return results

    parallel_apply(
        func=paragraph_process,
        iterable=corpus,
        workers=workers,
        max_queue_size=max_queue_size,
        callback=write_to_tfrecord,
    )

    writer.close()
    # 'count' resolves to the module-level global created via globals()['count'] above
    print('write %s examples into %s' % (count, record_name))
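# A hedged usage sketch of process(): it assumes `self` belongs to a dataset-builder
# class along the lines of the TrainingDataset classes in bert4keras's pretraining
# scripts, whose paragraph_process/tfrecord_serialize methods are defined elsewhere.
# The corpus generator and the file names below are purely illustrative:
def some_texts():
    # hypothetical reader: yields one list of paragraphs per document
    with open('corpus.txt', encoding='utf-8') as f:
        for line in f:
            yield [line.strip()]

# dataset = SomeTrainingDataset(tokenizer, sequence_length=512)  # hypothetical class
# dataset.process(
#     corpus=some_texts(),
#     record_name='corpus.tfrecord',
#     workers=8,
#     max_queue_size=2000,
# )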
def convert(data):
    """Split into sentences and convert to extractive summaries."""
    D = parallel_apply(
        func=extract_flow,
        iterable=tqdm(data, desc=u'converting data'),
        workers=100,
        max_queue_size=200
    )
    total_metric = sum([d[3] for d in D])
    D = [d[:3] for d in D]
    print(u'average metric of the extracted summaries: %s' % (total_metric / len(D)))
    return D
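# Note: extract_flow is defined elsewhere; convert() only assumes that, for each
# input item, it returns a 4-tuple whose last element is a per-sample evaluation
# metric (the first three elements are the converted sample itself). A purely
# hypothetical stub showing the expected shape, not the real implementation:
def extract_flow(item):
    # ... split `item` into sentences and pick the best summary sentences ...
    sentences, labels, summary, metric = [], [], u'', 0.0  # placeholder values
    return sentences, labels, summary, metric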
    for text in texts:
        for token in _tokenizer.tokenize(text):
            _tokens[token] = _tokens.get(token, 0) + 1
    return _tokens

tokens = {}

def _total_count(result):
    for k, v in result.items():
        tokens[k] = tokens.get(k, 0) + v

# word-frequency counting with 10 processes
parallel_apply(
    func=_tokenize_and_count,
    iterable=tqdm(_batch_texts(), desc=u'building the vocabulary'),
    workers=10,
    max_queue_size=100,
    callback=_total_count,
    # dummy=True,  # set dummy=True when running on Windows
)

tokens = [(i, j) for i, j in tokens.items() if j >= min_count]
tokens = sorted(tokens, key=lambda t: -t[1])
tokens = [t[0] for t in tokens]
json.dump(
    tokens,
    open(seq2seq_config, 'w', encoding='utf-8'),
    indent=4,
    ensure_ascii=False
)

token_dict, keep_words = {}, []  # keep_words: token ids kept from the original BERT vocabulary

for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    for text in texts:
        for token in _tokenizer.tokenize(text):
            _tokens[token] = _tokens.get(token, 0) + 1
    return _tokens

tokens = {}

def _total_count(result):
    for k, v in result.items():
        tokens[k] = tokens.get(k, 0) + v

# word-frequency counting
parallel_apply(
    func=_tokenize_and_count,
    iterable=tqdm(_batch_texts(), desc=u'building the vocabulary'),
    workers=10,
    max_queue_size=500,
    callback=_total_count,
)

tokens = [(i, j) for i, j in tokens.items() if j >= min_count]
tokens = sorted(tokens, key=lambda t: -t[1])
tokens = [t[0] for t in tokens]

token_dict, keep_words = {}, []  # keep_words: token ids kept from the original BERT vocabulary

for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_dict[t] = len(token_dict)
    keep_words.append(_token_dict[t])

for t in tokens:
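# Note: keep_words collects ids looked up in _token_dict, the full vocabulary of the
# original BERT checkpoint (presumably loaded with bert4keras's load_vocab), so that
# the token embedding matrix can later be trimmed to the new, smaller vocabulary.
# A hedged sketch of how such a list is typically consumed in bert4keras; the config
# and checkpoint paths are illustrative:
from bert4keras.models import build_transformer_model

model = build_transformer_model(
    config_path='bert_config.json',     # illustrative path
    checkpoint_path='bert_model.ckpt',  # illustrative path
    keep_tokens=keep_words,             # keep only these rows of the token embedding
)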
            batch = []
            k += 1
    if batch:
        yield batch, k

def local_shuf(batch_k):
    batch, k = batch_k
    np.random.shuffle(batch)
    with open('corpus_local_shuf/%05d.json' % k, 'w') as f:
        for text in batch:
            f.write(text)

parallel_apply(
    func=local_shuf,
    iterable=generator(),
    workers=5,
    max_queue_size=10
)

# =========== global shuffle ===========

jsons = glob.glob('corpus_local_shuf/*.json')
opens = [open(j) for j in jsons]
n, k = 0, 0
F = open('corpus_shuf/%05d.json' % k, 'w')

for i in tqdm(range(batch_size), ncols=0, desc='Global Shuffling'):
    orders = np.random.permutation(len(opens))
    for i in orders:
        text = opens[i].readline()
    if texts:
        yield texts

def count(texts):
    tokens = {}
    for text in texts:
        for t in sp_model.encode_as_pieces(text):
            tokens[t] = tokens.get(t, 0) + 1
    return tokens

def callback(tokens):
    for k, v in tokens.items():
        global_tokens[k] = global_tokens.get(k, 0) + v

parallel_apply(
    func=count,
    iterable=tqdm(corpus()),
    workers=20,
    max_queue_size=1000,
    callback=callback,
)

import pandas as pd

dic = pd.Series(global_tokens).sort_values(ascending=False)
dic.to_csv('result.csv', header=None, encoding='utf-8', sep='\t')
json.dump(global_tokens, open('result.json', 'w'))
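# Note: sp_model above is a trained SentencePiece model loaded elsewhere, and
# encode_as_pieces is the standard SentencePiece call for splitting text into
# subword pieces. A minimal sketch of the assumed setup; the .model path is
# illustrative:
import sentencepiece as spm

sp_model = spm.SentencePieceProcessor()
sp_model.load('spiece.model')  # path to a trained SentencePiece model file
print(sp_model.encode_as_pieces(u'an example sentence'))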