import re

# Project-local helper modules (import paths assumed from how they are used below).
import myVocab
import myRe
import myUtils


def single_input(txt, max_seq_len, vocab_dict, ischar, hascls, hassep, stops=[]):
    '''
    Take a text, truncate it to the maximum length, and return one formatted example.
    params:
        txt: the input text
        max_seq_len: maximum sequence length
        vocab_dict: the vocabulary
        ischar: True to tokenize into individual characters, otherwise use normal word segmentation
        hascls: whether the sequence starts with [CLS]
        hassep: whether the sequence ends with [SEP]
    Returns three lists: tokens_ids, tokens_mask, tokens_seg.

    BERT inputs generally look like this:
    (a) two sentences:
        tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        type_ids:   0   0    0    0     0       0   0   0   1  1  1   1  1   1
    (b) a single sentence:
        tokens:   [CLS] the dog is hairy . [SEP]
        type_ids:   0    0   0   0    0   0   0

    type_ids corresponds to segment_ids in the original BERT code and distinguishes the
    first sentence from the second: 0 for the first sentence, 1 for the second. During
    pretraining it is added to the token embeddings. It is not strictly required, since
    [SEP] already separates the two sentences, but type_ids makes learning easier.
    The mask marks positions holding real text so that [PAD] positions are ignored.
    '''
    assert "[PAD]" in vocab_dict.keys()
    if hascls:
        assert "[CLS]" in vocab_dict.keys()
    if hassep:
        assert "[SEP]" in vocab_dict.keys()
    split_tokens = myVocab.tokenize(txt, ischar=ischar, stops=stops)
    tokens_ids = myVocab.tokens_to_ids(split_tokens, vocab_dict, skip=True)
    # Add [CLS] and [SEP]
    if hascls:
        tokens_ids = [vocab_dict["[CLS]"]] + tokens_ids[:max_seq_len - 1]
    else:
        tokens_ids = tokens_ids[:max_seq_len]
    if hassep:
        if len(tokens_ids) == max_seq_len:
            tokens_ids[-1] = vocab_dict["[SEP]"]
        else:
            tokens_ids += [vocab_dict["[SEP]"]]
    # Pad everything to the same length
    tokens_mask = [1] * len(tokens_ids) + [0] * (max_seq_len - len(tokens_ids))
    tokens_seg = [0] * max_seq_len
    tokens_ids += [vocab_dict["[PAD]"]] * (max_seq_len - len(tokens_ids))
    return tokens_ids, tokens_mask, tokens_seg
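
# Illustrative sketch only: a tiny, self-contained reimplementation of the
# [CLS]/[SEP]/[PAD] convention that single_input() produces. The toy vocabulary and the
# whitespace tokenizer are assumptions for the demo, not the project's real myVocab
# module, and truncation is simplified to always reserve room for [SEP].
def _demo_single_input():
    vocab = {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2, "the": 3, "dog": 4, "is": 5, "hairy": 6}
    max_seq_len = 8
    ids = [vocab[t] for t in "the dog is hairy".split() if t in vocab]
    ids = [vocab["[CLS]"]] + ids[:max_seq_len - 2] + [vocab["[SEP]"]]
    mask = [1] * len(ids) + [0] * (max_seq_len - len(ids))  # 1 for real tokens, 0 for padding
    seg = [0] * max_seq_len                                  # single sentence -> all segment 0
    ids += [vocab["[PAD]"]] * (max_seq_len - len(ids))
    # ids  -> [1, 3, 4, 5, 6, 2, 0, 0]
    # mask -> [1, 1, 1, 1, 1, 1, 0, 0]
    # seg  -> [0, 0, 0, 0, 0, 0, 0, 0]
    return ids, mask, seg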
def format_file(file, stops):
    '''
    Read a document, format it, and return plain text.
    Plain-text format: one sentence per line, with the words of each sentence separated by spaces.
    '''
    with open(file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    res = []
    for line in lines:
        line = myVocab.tokenize(line.strip(), ischar=False, stops=stops)
        line = " ".join(line)
        res.append(line)
    res = "\n".join(res)
    res = myRe.del_repeat(res, "\n")
    return res
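
# Illustrative sketch only: a rough, self-contained approximation of the text that
# format_file() returns. str.split() stands in for myVocab.tokenize() and re.sub() for
# myRe.del_repeat(); both real helpers live in project modules and may behave differently.
def _demo_format_file():
    raw = "the dog  is hairy.\n\n\nit is not.\n"
    lines = [" ".join(line.split()) for line in raw.splitlines()]
    text = re.sub("\n+", "\n", "\n".join(lines)).strip()
    # text -> "the dog is hairy.\nit is not."  (one sentence per line, space-separated words)
    return text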
def fuzzy(src_file, dest_file, stops_path, freqs_path):
    '''
    Blur case-specific details (dates, counts, amounts, IDs, locations) in a
    tab-separated corpus and write the result to dest_file.
    '''
    def _fuzzy(txt):
        # The order of the substitutions below must not change.
        # Drop day/month mentions, 4-5 digit years, and article/clause numbers.
        txt = re.sub(
            "(第?[零〇一二两三四五六七八九十0123456789]{1,3}个?[月日])|([零〇一二两三四五六七八九十0123456789]{4,5}年)|(第?[零〇一二两三四五六七八九十百千0123456789]{1,5}[条款])",
            "", txt)
        # Bucket year spans: long / short / medium.
        txt = re.sub("[零〇一二两三四五六七八九十0123456789]{2,3}年", "较长年份", txt)
        txt = re.sub("[零〇一二两012]年", "较短年份", txt)
        txt = re.sub("[三四五六七八九十3456789(10)]年", "中等年份", txt)
        # Bucket counts of occurrences: few / many.
        txt = re.sub("第?[1一]次", "较少次", txt)
        txt = re.sub("第?[0-9零〇一二两三四五六七八九十]{1,}次", "较多次", txt)
        # Bucket monetary amounts: large / medium / small.
        txt = re.sub(
            "([0-9]{5,}元)|([一二两三四五六七八九十]万[零〇一二两三四五六七八九十千百]*元)|([5-9][0-9]{3,3}元)|([五六七八九]千[零〇一二两三四五六七八九十百]+元)",
            "较大金额", txt)
        txt = re.sub("([1-4][0-9]{3,3}元)|([一二两三四五]千[零〇一二两三四五六七八九十百]*元)", "中等金额", txt)
        txt = re.sub("[0-9]{1,3}元|[零〇一二两三四五六七八九十百]+元", "较少金额", txt)
        # Remove remaining identifying details: weights/amounts, birthplace and registered
        # residence, licence plates, dates, venues, ID numbers, phone brands.
        regex = [
            "[0-9]+(\\.[0-9]+)?(克|元)",
            "[零〇一二两三四五六七八九十百千]+[元克]",
            "(出生于|户籍地)(.+?)[,,.。]",
            "车牌号(.*)[0-9A-Za-z]",
            "[0-9零〇一二两三四五六七八九十]{1,4}年",
            "[0-9零〇一二两三四五六七八九十][0-9零〇一二两三四五六七八九十]?个?[月日]",
            "在(.+?)(宾馆|酒店|会所|家中|大道|街|饭店)",
            "身份证[号]?[A-Za-z0-9]{18}",
            "(OPPO|苹果|三星)[牌]?手机"
        ]
        for reg in regex:
            txt = re.sub(reg, "", txt)
        return txt

    res = []
    stops = myUtils.get_list_by_line(stops_path)
    freqs = myUtils.json_file_to_dict(freqs_path)
    with open(src_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    for line in lines:
        if line.strip() != "":
            line = line.strip().split("\t")
            # Blur and re-tokenize the text column.
            txt = _fuzzy(line[-1])
            txt_list = myVocab.tokenize(txt, ischar=False, stops=stops)
            txt_list = myVocab.filter_freq(txt_list, freqs, 4, 1e12, skip=False)
            txt_list = myRe.filter_location(txt_list)
            txt = "".join(txt_list)
            txt = re.sub(",+", ",", txt)
            # Strip a leading/trailing comma (also handles an empty result safely).
            if txt.startswith(","):
                txt = txt[1:]
            if txt.endswith(","):
                txt = txt[:-1]
            line[-1] = txt
            res.append("\t".join(line))
    # Write to file
    myUtils.write_text_into_file("\n".join(res), dest_file)
    return
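
# Illustrative sketch only: shows the effect of two of the ordered rules in _fuzzy() on an
# assumed sample sentence. Case-specific numbers are mapped to coarse buckets
# ("较多次" / "较大金额" here) so the rough magnitude survives but the exact value does not.
def _demo_fuzzy():
    sample = "被告人第3次盗窃,窃得现金8500元。"
    sample = re.sub("第?[0-9零〇一二两三四五六七八九十]{1,}次", "较多次", sample)
    sample = re.sub("([0-9]{5,}元)|([5-9][0-9]{3,3}元)", "较大金额", sample)
    # sample -> "被告人较多次盗窃,窃得现金较大金额。"
    return sample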