Example #1
def single_input(txt,
                 max_seq_len,
                 vocab_dict,
                 ischar,
                 hascls,
                 hassep,
                 stops=[]):
    '''
    Take a text `txt` and truncate it to the maximum length, producing a single formatted example.

    params:
        txt: the input text;
        max_seq_len: maximum sequence length;
        vocab_dict: the vocabulary (token -> id);
        ischar: True tokenizes into single characters, otherwise uses normal word segmentation;
        hascls: whether the sequence starts with [CLS];
        hassep: whether the sequence ends with [SEP];
        stops: optional list of stop words dropped during tokenization.
    Returns three lists: tokens_ids, tokens_mask, tokens_seg.

    BERT input typically looks like this:
        (a) two sentences:
        tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        (b) a single sentence:
        tokens:   [CLS] the dog is hairy . [SEP]
        type_ids: 0     0   0   0  0     0 0

        type_ids corresponds to segment_ids in the source code and distinguishes the
        first sentence from the second: 0 for the first sentence, 1 for the second.
        During pre-training it is added to the token embeddings. It is not strictly
        necessary, because [SEP] already separates the two sentences, but type_ids
        makes learning easier.

        The mask marks the positions that hold real tokens so that [PAD] positions
        are ignored.
    '''
    assert "[PAD]" in vocab_dict.keys()
    if hascls:
        assert "[CLS]" in vocab_dict.keys()
    if hassep:
        assert "[SEP]" in vocab_dict.keys()

    split_tokens = myVocab.tokenize(txt, ischar=ischar, stops=stops)
    tokens_ids = myVocab.tokens_to_ids(split_tokens, vocab_dict, skip=True)
    # Add [CLS] and [SEP]
    if hascls:
        tokens_ids = [vocab_dict["[CLS]"]] + tokens_ids[:max_seq_len - 1]
    else:
        tokens_ids = tokens_ids[:max_seq_len]
    if hassep:
        if len(tokens_ids) == max_seq_len:
            # Sequence already full: overwrite the last token with [SEP].
            tokens_ids[-1] = vocab_dict["[SEP]"]
        else:
            tokens_ids += [vocab_dict["[SEP]"]]
    # Pad everything to the same length (the mask is built before padding, so [PAD] positions get 0).
    tokens_mask = [1] * len(tokens_ids) + [0] * (max_seq_len - len(tokens_ids))
    tokens_seg = [0] * max_seq_len
    tokens_ids += [vocab_dict["[PAD]"]] * (max_seq_len - len(tokens_ids))
    return tokens_ids, tokens_mask, tokens_seg
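
The construction of tokens_ids / tokens_mask / tokens_seg is easiest to see on ids that are already tokenized. Below is a minimal, self-contained sketch of the same truncate / [CLS] / [SEP] / pad logic with a toy vocabulary (demo_single_input and toy_vocab are illustrative names only; the real function delegates tokenization to the project's myVocab module):

# Illustrative sketch, not part of the project: reproduces the id/mask/seg
# construction above for pre-tokenized ids and a toy vocabulary.
toy_vocab = {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2, "我": 3, "们": 4, "好": 5}

def demo_single_input(ids, max_seq_len, vocab):
    ids = [vocab["[CLS]"]] + ids[:max_seq_len - 1]   # reserve one slot for [CLS]
    if len(ids) == max_seq_len:
        ids[-1] = vocab["[SEP]"]                     # sequence full: overwrite the last token
    else:
        ids.append(vocab["[SEP]"])
    mask = [1] * len(ids) + [0] * (max_seq_len - len(ids))
    seg = [0] * max_seq_len                          # single sentence -> all segment ids are 0
    ids += [vocab["[PAD]"]] * (max_seq_len - len(ids))
    return ids, mask, seg

print(demo_single_input([3, 4, 5], 6, toy_vocab))
# ([1, 3, 4, 5, 2, 0], [1, 1, 1, 1, 1, 0], [0, 0, 0, 0, 0, 0])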
Example #2
def format_file(file, stops):
    '''
    Read a document, normalize it, and return plain text.
    Plain-text format: one sentence per line, with the words of each sentence separated by spaces.
    '''
    with open(file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    res = []
    for line in lines:
        line = myVocab.tokenize(line.strip(), ischar=False, stops=stops)
        line = " ".join(line)
        res.append(line)
    res = "\n".join(res)
    res = myRe.del_repeat(res, "\n")  # presumably collapses repeated newlines (drops blank lines)
    return res
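
Assuming the project-local myUtils / myVocab / myRe modules are importable, usage might look like the sketch below; the file paths are hypothetical:

# Hypothetical usage sketch: normalize a raw corpus into the one-sentence-per-line,
# space-separated format described in the docstring. Paths are made up for illustration.
stops = myUtils.get_list_by_line("data/stopwords.txt")
plain_text = format_file("data/raw_corpus.txt", stops)
myUtils.write_text_into_file(plain_text, "data/pretrain_corpus.txt")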
Example #3
# Requires the project-local myUtils, myVocab and myRe helper modules, plus:
import re


def fuzzy(src_file, dest_file, stops_path, freqs_path):
    def _fuzzy(txt):
        # The order of the substitutions below must not change.
        # Drop exact dates (day/month), absolute 4-5 digit years, and article/clause numbers.
        txt = re.sub(
            "(第?[零〇一二两三四五六七八九十0123456789]{1,3}个?[月日])|([零〇一二两三四五六七八九十0123456789]{4,5}年)|(第?[零〇一二两三四五六七八九十百千0123456789]{1,5}[条款])",
            "", txt)
        # Bucket year spans: 较长年份 / 较短年份 / 中等年份 = long / short / medium span of years.
        txt = re.sub("[零〇一二两三四五六七八九十0123456789]{2,3}年", "较长年份", txt)
        txt = re.sub("[零〇一二两012]年", "较短年份", txt)
        txt = re.sub("[三四五六七八九十3456789(10)]年", "中等年份", txt)
        # Bucket counts: 较少次 / 较多次 = few / many times.
        txt = re.sub("第?[1一]次", "较少次", txt)
        txt = re.sub("第?[0-9零〇一二两三四五六七八九十]{1,}次", "较多次", txt)
        # Bucket monetary amounts: 较大金额 / 中等金额 / 较少金额 = large / medium / small amount.
        txt = re.sub(
            "([0-9]{5,}元)|([一二两三四五六七八九十]万[零〇一二两三四五六七八九十千百]*元)|([5-9][0-9]{3,3}元)|([五六七八九]千[零〇一二两三四五六七八九十百]+元)",
            "较大金额", txt)
        txt = re.sub("([1-4][0-9]{3,3}元)|([一二两三四五]千[零〇一二两三四五六七八九十百]*元)",
                     "中等金额", txt)
        txt = re.sub("[0-9]{1,3}元|[零〇一二两三四五六七八九十百]+元", "较少金额", txt)

        # Remove remaining fine-grained details entirely: weights/amounts, birthplace or
        # household registration, license plates, dates, locations, ID numbers, phone brands.
        regex = [
            "[0-9]+(\\.[0-9]+)?(克|元)", "[零〇一二两三四五六七八九十百千]+[元克]",
            "(出生于|户籍地)(.+?)[,,.。]", "车牌号(.*)[0-9A-Za-z]",
            "[0-9零〇一二两三四五六七八九十]{1,4}年",
            "[0-9零〇一二两三四五六七八九十][0-9零〇一二两三四五六七八九十]?个?[月日]",
            "在(.+?)(宾馆|酒店|会所|家中|大道|街|饭店)", "身份证[号]?[A-Za-z0-9]{18}",
            "(OPPO|苹果|三星)[牌]?手机"
        ]
        for reg in regex:
            txt = re.sub(reg, "", txt)

        return txt

    res = []
    stops = myUtils.get_list_by_line(stops_path)
    freqs = myUtils.json_file_to_dict(freqs_path)
    with open(src_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    for line in lines:
        if line.strip() != "":
            line = line.strip().split("\t")
            # Process the text field (the last tab-separated column)
            txt = _fuzzy(line[-1])
            txt_list = myVocab.tokenize(txt, ischar=False, stops=stops)
            txt_list = myVocab.filter_freq(txt_list,
                                           freqs,
                                           4,
                                           1e12,
                                           skip=False)
            txt_list = myRe.filter_location(txt_list)
            txt = "".join(txt_list)
            # Collapse runs of commas left over after the deletions above.
            txt = re.sub(",+", ",", txt)
            # Strip a leading/trailing comma; an empty txt here is unexpected, so pause for inspection.
            try:
                if txt[0] == ",":
                    txt = txt[1:]
                if txt[-1] == ",":
                    txt = txt[:-1]
            except IndexError:
                print(str(lines), lines[3], txt_list)
                input(line)
            line[-1] = txt
            res.append("\t".join(line))
    # Write the result to dest_file
    myUtils.write_text_into_file("\n".join(res), dest_file)
    return
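
The comment at the top of _fuzzy says the substitution order must not change. A minimal, self-contained sketch of just the year rules (bucket_years is an illustrative name, not part of the project) shows why: the 4-5 digit absolute years have to be removed before the 2-3 digit rule runs, which in turn must run before the single-character rules, otherwise a year span like 十二年 or 一年 would be caught by the wrong bucket.

import re

# Illustrative sketch of the year-bucketing rules from _fuzzy, in their required order.
def bucket_years(txt):
    txt = re.sub("[零〇一二两三四五六七八九十0123456789]{4,5}年", "", txt)          # drop absolute years
    txt = re.sub("[零〇一二两三四五六七八九十0123456789]{2,3}年", "较长年份", txt)  # long spans
    txt = re.sub("[零〇一二两012]年", "较短年份", txt)                              # short spans
    txt = re.sub("[三四五六七八九十3456789]年", "中等年份", txt)                    # medium spans
    return txt

print(bucket_years("2019年因盗窃被判刑三年,十二年前曾被判刑一年"))
# -> 因盗窃被判刑中等年份,较长年份前曾被判刑较短年份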