Example #1
 def params_init(self):
     """Initialize hyperparameters."""
     # params
     path_params = os.path.join(self.path_dir, "params.json")
     self.params = load_json(path_params)
     self.len_max = self.params["len_max"]
     self.level_type = self.params["level_type"]
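
A hedged sketch of producing the params.json this method reads; the values and path_dir are hypothetical, and save_json is the counterpart of the load_json used throughout these examples (it appears in Example #6):

 # hypothetical payload; only the two keys read above come from the source
 params = {"len_max": 50, "level_type": "char"}
 save_json(params, os.path.join(path_dir, "params.json"))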
Example #2
 def load_pinyin_dict(self):
     """
         Load the default pinyin dictionary.
     :return: None
     """
     dict_pinyin = load_json(path_dict_pinyin)[0]  # load the JSON dictionary file
     for k, v in dict_pinyin.items():
         self.dict_pinyin[k] = v
Example #3
 def load_macropodus_dict(self):
     """
         Load the default base dictionary.
     :return: None
     """
     dict_macropodus = load_json(path_dict_macropodus)[0]  # load the JSON dictionary file
     dict_macropodus_def = defaultdict()  # convert to defaultdict (no factory: behaves like a plain dict)
     for k, v in dict_macropodus.items():
         dict_macropodus_def[k] = v
     self.dict_words_freq = dict_macropodus_def  # {word: frequency} dictionary
Example #4
 def load_user_dict(self, path_user=path_dict_user, type_user="******"):
     """
         Load a user dictionary.
     :param path_user: str, like '/home/user.dict'
     :param type_user: str, like 'json', 'txt', 'csv'
     :return: None
     """
     if not os.path.exists(path_user):
         raise RuntimeError("your path_user does not exist!")
     if type_user == "json":
         self.dict_user = load_json(path_user)[0]  # load the JSON dictionary file
         for k, v in self.dict_user.items():
             if k not in self.dict_words_freq:
                 self.dict_words_freq[k] = v  # merge into the main words_freq dict
             else:
                 self.dict_words_freq[k] = self.dict_words_freq[k] + v
         self.num_words = sum(self.dict_words_freq.values())
     elif type_user == "txt":
         words_all = txt_read(path_user)
         for word_freq in words_all:
             wf = word_freq.split(" ")  # a space separates the word from an optional frequency
             word = wf[0]
             freq = int(wf[1]) if len(wf) == 2 else 132  # default frequency when none is given
             if word not in self.dict_words_freq:
                 self.dict_words_freq[word] = freq  # merge into the main words_freq dict
             else:
                 self.dict_words_freq[word] = self.dict_words_freq[word] + freq
         self.num_words = sum(self.dict_words_freq.values())
     elif type_user == "csv":
         words_all = txt_read(path_user)
         for word_freq in words_all:
             wf = word_freq.split(",")  # a comma separates the word from an optional frequency
             word = wf[0]
             freq = int(wf[1]) if len(wf) == 2 else 132  # default frequency when none is given
             if word not in self.dict_words_freq:
                 self.dict_words_freq[word] = freq  # merge into the main words_freq dict
             else:
                 self.dict_words_freq[word] = self.dict_words_freq[word] + freq
         self.num_words = sum(self.dict_words_freq.values())
     else:
         raise ValueError("type_user must be 'json', 'txt' or 'csv'")
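
A hedged usage sketch for load_user_dict; seg, the path, and the file contents are hypothetical:

 # /home/user.dict.txt (hypothetical), one "word[ freq]" pair per line:
 #   macropodus 32
 #   segmentation
 seg.load_user_dict(path_user="/home/user.dict.txt", type_user="txt")
 print(seg.num_words)  # total frequency of the merged dictionary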
Example #5
# load model weights
model.load_weights(path_dir + "/model.h5")

# reader tokenizer
token_dict = {}
path_dict = os.path.join(path_model_dir, "vocab.txt")
with codecs.open(path_dict, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)  # assign each token an incremental id

vocab_size = len(token_dict)
tokenizer = Tokenizer(token_dict)
# params
path_params = path_dir + "/params.json"
params = load_json(path_params)
len_max = params["len_max"]
# l2i_i2l
path_l2i_i2l = path_dir + "/l2i_i2l.json"
l2i_i2l = load_json(path_l2i_i2l)
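
If the Tokenizer here is keras_bert's Tokenizer (an assumption, based on the token_dict constructor), a hedged encoding sketch:

token_ids, segment_ids = tokenizer.encode("自然语言处理", max_len=len_max)  # pads/truncates to len_max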


def sentence2idx(text):
    text = extract_chinese(str(text).upper())
    text = list(text)
    len_leave = len_max - len(text)
    if len_leave >= 0:
        # assumed completion: pad to len_max with '[PAD]' (token presumed to be in vocab.txt)
        text_index = [token_dict[text_char] if text_char in token_dict
                      else token_dict['[UNK]'] for text_char in text]
        text_index += [token_dict['[PAD]'] for _ in range(len_leave)]
    else:
        # assumed completion: truncate to len_max
        text_index = [token_dict[text_char] if text_char in token_dict
                      else token_dict['[UNK]'] for text_char in text[:len_max]]
    return text_index
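
A hedged usage sketch of the function as reconstructed above; the sample text is arbitrary:

text_index = sentence2idx("自然语言处理")
print(len(text_index))  # always len_max after padding or truncation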
Example #6

    def preprocess_label_question_to_idx_fit(self, embedding_type, path,
                                             embed, rate=1, crf_mode='reg'):
        """
            fit用, 关键:对每一条数据操作,获取label和问句index              
        :param embedding_type: str, like 'albert'
        :param path: str, like 'train.json'
        :param embed: class, like embed
        :param rate: float, like 0.9
        :param crf_mode: str, like 'reg', 'pad'
        :return: np.array
        """
        # first collect the label set, i.e. the concrete classes that occur
        label_set, len_all = self.preprocess_label2set(path, embedding_type)
        # build the label<->index dicts; skip if they already exist (used with the dev set)
        if not os.path.exists(self.path_model_l2i_i2l):
            count = 0
            label2index = {}
            index2label = {}
            for label_one in label_set:
                label2index[label_one] = count
                index2label[count] = label_one
                count = count + 1
            l2i_i2l = {}
            l2i_i2l['l2i'] = label2index
            l2i_i2l['i2l'] = index2label
            save_json(l2i_i2l, self.path_model_l2i_i2l)
        else:
            l2i_i2l = load_json(self.path_model_l2i_i2l)

        # proportion of the data to read
        len_ql = int(rate * len_all)
        if len_ql <= 500:  # ignore the rate for small samples so there is enough data to train on
            len_ql = len_all

        def process_line(line, embed, l2i_i2l):
            """
                Process one sample: get the label and question indices.
            :param line: str, one JSON sample
            :param embed: class, the embedding object
            :param l2i_i2l: dict, the label<->index maps
            :return: tuple, (question indices, one-hot label)
            """
            # pad the question and the label for each sample
            ques_label = json.loads(line.strip())
            label_org = ques_label["label"]
            label_index = [l2i_i2l["l2i"][lr] for lr in label_org]
            # len_sequence = len(label_index)
            que_embed = embed.sentence2idx("".join(ques_label["question"]))
            # label padding
            if embedding_type in ['bert', 'albert']:
                # reserve two positions for <CLS> and <SEP>
                len_leave = embed.len_max - len(label_index) - 2
                if len_leave >= 0:
                    label_index_leave = ([l2i_i2l["l2i"]["<CLS>"]] + label_index
                                         + [l2i_i2l["l2i"]["<PAD>"] for _ in range(len_leave)]
                                         + [l2i_i2l["l2i"]["<SEP>"]])
                else:
                    label_index_leave = ([l2i_i2l["l2i"]["<CLS>"]]
                                         + label_index[0:embed.len_max - 2]
                                         + [l2i_i2l["l2i"]["<SEP>"]])
            else:
                # pad the label to len_max
                len_leave = embed.len_max - len(label_index)
                if len_leave >= 0:
                    label_index_leave = label_index + [l2i_i2l["l2i"]["<PAD>"]
                                                       for _ in range(len_leave)]
                else:
                    label_index_leave = label_index[0:embed.len_max]
            # convert to one-hot
            label_res = to_categorical(label_index_leave,
                                       num_classes=len(l2i_i2l["l2i"]))
            return que_embed, label_res

        file_csv = open(path, "r", encoding="utf-8")
        cout_all_line = 0
        cnt = 0
        x, y = [], []
        for line in file_csv:
            # break once the requested number of lines has been read
            if len_ql < cout_all_line:
                break
            cout_all_line += 1
            if line.strip():
                # process one JSON object per line
                # note: ideally preprocess beforehand so the question length is <= len_max (word2vec) or len_max - 2 (bert, albert)
                x_line, y_line = process_line(line, embed, l2i_i2l)
                x.append(x_line)
                y.append(y_line.tolist())
                cnt += 1

        # handled along two axes: 1. embedding type (bert, word2vec, random); 2. CRF mode ('pad', 'reg')
        if embedding_type in ['bert', 'albert']:
            x_, y_ = np.array(x), np.array(y)
            x_1 = np.array([x[0] for x in x_])
            x_2 = np.array([x[1] for x in x_])
            x_3 = np.array([x[2] for x in x_])
            if crf_mode == 'pad':
                x_all = [x_1, x_2, x_3]
            elif crf_mode == 'reg':
                x_all = [x_1, x_2]
            else:
                x_all = [x_1, x_2]
        else:
            x_, y_ = np.array(x), np.array(y)
            x_1 = np.array([x[0] for x in x_])
            x_2 = np.array([x[1] for x in x_])
            if crf_mode == 'pad':
                x_all = [x_1, x_2]
            elif crf_mode == 'reg':
                x_all = x_1
            else:
                x_all = x_1
        # returned for use with fit
        return x_all, y_
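
A hedged sketch of consuming the return values, assuming a compiled Keras model; pre, embed, and the paths are hypothetical:

# hypothetical: pre is an instance of the preprocessing class above
x_all, y_ = pre.preprocess_label_question_to_idx_fit(
    embedding_type="albert", path="train.json", embed=embed,
    rate=1, crf_mode="reg")
model.fit(x_all, y_, batch_size=32, epochs=3)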
Example #7

 def __init__(self, path_model_l2i_i2l):
     self.path_model_l2i_i2l = path_model_l2i_i2l
     self.l2i_i2l = None
     if os.path.exists(self.path_model_l2i_i2l):
         self.l2i_i2l = load_json(self.path_model_l2i_i2l)
Example #8

 def l2i_i2l_init(self):
     """Conversion between labels and indices"""
     # l2i_i2l
     path_l2i_i2l = os.path.join(self.path_dir, "l2i_i2l.json")
     self.l2i_i2l = load_json(path_l2i_i2l)
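
For reference, l2i_i2l.json holds the two mirrored maps written in Example #6; a minimal sketch with hypothetical labels:

 l2i_i2l = {"l2i": {"<PAD>": 0, "O": 1}, "i2l": {0: "<PAD>", 1: "O"}}
 save_json(l2i_i2l, "l2i_i2l.json")
 # note: after a JSON round-trip the "i2l" keys come back as strings,
 # so lookups need str(index): l2i_i2l["i2l"]["0"]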
Example #9
 def tokenizer_init(self):
     """Vocabulary"""
     # reader tokenizer
     path_dict = os.path.join(self.path_dir, "vocab.txt")
     self.token2idx = load_json(path_dict)