Example #1
def rnn_layer(units: int, input_feature_dim: int, cell_type: str = 'lstm',
              if_bidirectional: bool = True) -> tf.keras.Model:
    """
    RNN layer where the cell type and bidirectionality are configurable
    :param units: number of cell units
    :param input_feature_dim: size of the input feature dimension
    :param cell_type: cell type, lstm/gru, default lstm
    :param if_bidirectional: whether to use a bidirectional layer
    :return: Multi-layer RNN
    """
    inputs = tf.keras.Input(shape=(None, input_feature_dim))
    if cell_type == 'lstm':
        rnn = tf.keras.layers.LSTM(units=units, return_sequences=True, return_state=True,
                                   recurrent_initializer='glorot_uniform')
    elif cell_type == 'gru':
        rnn = tf.keras.layers.GRU(units=units, return_sequences=True, return_state=True,
                                  recurrent_initializer='glorot_uniform')
    else:
        print('Invalid cell type, see the log for details')
        utils.log_operator(level=10).info("Invalid cell type")
        raise ValueError("cell_type must be 'lstm' or 'gru'")

    if if_bidirectional:
        rnn = tf.keras.layers.Bidirectional(rnn)

    rnn_outputs = rnn(inputs)
    outputs = rnn_outputs[0]
    states = outputs[:, -1, :]

    return tf.keras.Model(inputs=inputs, outputs=[outputs, states])
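
A minimal usage sketch (not from the original source; the shapes and hyperparameters are illustrative): build the model returned by rnn_layer and run a dummy batch through it to inspect both outputs.

import tensorflow as tf

# Assumes rnn_layer (defined above) is importable in the current scope.
model = rnn_layer(units=128, input_feature_dim=64, cell_type='gru', if_bidirectional=True)
dummy_batch = tf.random.normal((2, 10, 64))     # (batch, time steps, features)
sequence_output, last_step_state = model(dummy_batch)
print(sequence_output.shape)  # (2, 10, 256): bidirectional outputs are concatenated
print(last_step_state.shape)  # (2, 256): features of the last time step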
Example #2
    def __init__(self, execute_type: str, checkpoint_dir: str, num_layers: int,
                 units: int, d_model: int, num_heads: int, dropout: float,
                 start_sign: str, end_sign: str, beam_size: int,
                 vocab_size: int, dict_fn: str, max_length: int):
        """
        Transformer chatbot initialization, used to load the model
        :param execute_type: dialogue execution mode
        :param checkpoint_dir: directory path where checkpoints are saved
        :param num_layers: number of layers inside the transformer
        :param units: number of units
        :param d_model: embedding dimension
        :param num_heads: number of attention heads
        :param dropout: dropout rate
        :param start_sign: start token
        :param end_sign: end token
        :param beam_size: beam size
        :param vocab_size: vocabulary size
        :param dict_fn: path where the dictionary is saved
        :param max_length: maximum length of a single sentence
        :return: no return value
        """
        super().__init__(checkpoint_dir, beam_size, max_length, dict_fn,
                         start_sign, end_sign)

        self.model = transformer.transformer(vocab_size=vocab_size,
                                             num_layers=num_layers,
                                             units=units,
                                             d_model=d_model,
                                             num_heads=num_heads,
                                             dropout=dropout)
        self.learning_rate = optimizers.CustomSchedule(d_model)
        self.optimizer = tf.keras.optimizers.Adam(self.learning_rate,
                                                  beta_1=0.9,
                                                  beta_2=0.98,
                                                  epsilon=1e-9)
        self.train_loss = tf.keras.metrics.Mean(name='train_loss')
        self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
            name='train_accuracy')

        self.checkpoint = tf.train.Checkpoint(transformer=self.model,
                                              optimizer=self.optimizer)

        print('Checking whether a checkpoint exists')
        if self.ckpt:
            print('Checkpoint found, loading checkpoint')
            self.checkpoint.restore(
                tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()
        else:
            if execute_type == "train":
                print('No checkpoint found, running in train mode')
            else:
                print('No checkpoint found, please run train mode first before entering chat mode')
                exit(0)

        log_operator(level=10).info(
            "Starting Transformer chatbot, execute type: {}, model config: num_layers: {}, "
            "d_model: {}, num_heads: {}, units: {}, dropout: {}, vocab_size: {}, "
            "max_length: {}".format(execute_type, num_layers, d_model,
                                    num_heads, units, dropout, vocab_size,
                                    max_length))
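
The learning-rate schedule referenced above, optimizers.CustomSchedule(d_model), is not shown here. A sketch of the warmup schedule such a class most likely implements (the one from "Attention Is All You Need", also used in the TensorFlow Transformer tutorial) is given below; the actual class in the repository may differ.

import tensorflow as tf

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """lr = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)"""

    def __init__(self, d_model: int, warmup_steps: int = 4000):
        super().__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.minimum(arg1, arg2)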
Example #3
def combine_tokenized_data_single(standby_data: list, combine_data: str, if_remove: bool = True):
    """
    *Single-turn dialogue dataset processing module*
    Merges all tokenized question-answer pairs into one file
    :param standby_data: paths of the tokenized data files
    :param combine_data: path of the combined data file
    :param if_remove: whether to remove the original tokenized text
    :return: no return value
    """
    if os.path.exists(combine_data) and if_remove:
        os.remove(combine_data)

    count = 0
    file_count = 0

    for file_fn in standby_data:
        if not os.path.exists(file_fn):
            print("{}文件不存在,请检查之后再次运行".format(file_fn))
            exit(0)
        with open(file_fn, 'r', encoding='utf-8') as tokenized_file, open(combine_data, 'a',
                                                                          encoding='utf-8') as combine_file:
            for line in tokenized_file:
                line = line.strip().strip("\n").replace("/", " ")
                combine_file.write(line + "\n")
                count += 1
                print("\r数据处理进度:{}".format(count), flush=True, end="")

        file_count += 1

    message = "数据处理完毕,数据信息统计:共处理{}个分词文件,整理出{}条数据".format(file_count, count)
    print("\n" + message)
    logger = log_operator(level=10)
    logger.info(message)
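
A hypothetical call (the file paths are made up for illustration): merge two tokenized corpora into one combined file, removing any previously combined file first.

combine_tokenized_data_single(
    standby_data=["data/xiaohuangji_tokenized.txt", "data/douban_tokenized.txt"],
    combine_data="data/combined_tokenized.txt",
    if_remove=True)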
Example #4
def to_single_turn_dataset(tokenized_data_path: str,
                           qa_data_path: str,
                           remove_tokenized: bool = True):
    """生成单轮对话数据集

    用于处理已经分词好的多轮次数据集的方法,将数据集处理成问答对的形式
    :param tokenized_data_path: 已切分多轮对话数据路径
    :param qa_data_path: 单轮对话数据保存路径
    :param remove_tokenized: 是否移除原有分词文本
    :return: 无返回值
    """
    # _check_file(raw_file=raw_data_path, processed_file=qa_data_path, remove_tokenized=remove_tokenized)

    count = 0
    sentences_count = 0
    max_len = 0
    min_len = 10000
    sentence_len = []
    one_pair = []

    # Pair up the context of each dialogue round into a question and an answer; skip to the next round when a new round starts
    with open(tokenized_data_path, encoding="utf-8") as raw_file, \
            open(qa_data_path, 'w', encoding="utf-8") as single_turn_data_file:
        for line in raw_file:
            line = line.strip('\n').replace('/', '')
            # line = re.sub(r"[%s]+" % punctuation, "", line)
            # Because the raw dataset is arranged round by round, note that after a
            # round ends its last sentence cannot serve as a question, so skip to the next round
            if line == '':
                one_pair = []
                count += 1
                continue
            elif len(one_pair) == 1:
                one_pair.append(line)
                single_turn_data_file.write(one_pair[0] + "\t" + one_pair[1] +
                                            "\n")
                one_pair = [line]
                sentences_count += 1
                if sentences_count % 10000 == 0:
                    print('Processed:', sentences_count, 'QA pairs')
            else:
                one_pair.append(line)

            length = len(line)
            max_len = max(max_len, length)
            min_len = min(min_len, length)
            sentence_len.append(length)

    message = "对话数据集转换完毕:共处理{}轮对话数据,整理出{}对" \
              "问答对,语句最大长度:{},语句最短长度{},语句平均长度{:.3f}".format(count, sentences_count,
                                                           max_len, min_len, np.mean(sentence_len))
    print(message)
    logger = log_operator(level=10)
    logger.info(message)
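
The function writes one tab-separated question/answer pair per line. A minimal sketch (the path is illustrative) of reading that file back:

with open("data/single_turn_qa.txt", encoding="utf-8") as qa_file:
    qa_pairs = [line.rstrip("\n").split("\t") for line in qa_file if line.strip()]
print(qa_pairs[0])  # ['question tokens ...', 'answer tokens ...']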
Example #5
def preprocess_raw_douban_data(raw_data: str,
                               tokenized_data: str,
                               repeat_data: int = 10,
                               if_remove: bool = True):
    """
    Processes the douban dataset into multi-turn dialogue form and tokenizes it
    :param raw_data: path of the raw data
    :param tokenized_data: path where the generated token data is saved
    :param repeat_data: number of repeated data entries per dialogue round
    :param if_remove: whether to remove the original tokenized text
    :return: no return value
    """
    _check_file(raw_file=raw_data,
                processed_file=tokenized_data,
                remove_tokenized=if_remove)

    count = 0
    max_len = 0
    min_len = 10000
    sentence_len = []

    with open(raw_data, 'r', encoding='utf-8') as raw_file, open(
            tokenized_data, 'a', encoding='utf-8') as tokenized_file:
        iter_count = -1
        for line in raw_file:
            iter_count += 1
            if iter_count % repeat_data != 0:
                continue
            line = line.strip('\n').replace('/', '')
            if line == "":
                continue

            # Because the raw dataset is arranged round by round, after a round ends its last
            # sentence cannot serve as a question; also drop the leading label and the trailing malformed sentence
            utterances = line.split('\t')[1:-1]
            for utterance in utterances:
                length = len(utterance)
                sentence_len.append(length)
                max_len = max(max_len, length)
                min_len = min(min_len, length)
                tokenized_file.write(utterance + "\n")
            tokenized_file.write("\n")
            count += 1
            if count % 10000 == 0:
                print("数据处理进度:{}".format(count))

    message = "数据处理完毕,数据信息统计:共处理{}轮对话数据,语句最大长度:{},语" \
              "句最短长度{},语句平均长度{:.3f}".format(count, max_len, min_len, np.mean(sentence_len))

    print(message)
    logger = log_operator(level=10)
    logger.info(message)
Example #6
def preprocess_raw_wei_bo_data(raw_post_data: str,
                               raw_response_data: str,
                               tokenized_data: str,
                               if_remove: bool = True):
    """
    Processes the weibo dataset into multi-turn form and tokenizes it
    :param raw_post_data: path of the raw weibo post text data
    :param raw_response_data: path of the raw weibo response text data
    :param tokenized_data: path where the generated token data is saved
    :param if_remove: whether to remove the original tokenized text
    :return: no return value
    """
    _check_file(raw_file=raw_post_data,
                processed_file=tokenized_data,
                remove_tokenized=if_remove)
    if not os.path.exists(raw_response_data):
        print('Dataset does not exist, please add the dataset!')
        exit(0)

    count = 0
    max_len = 0
    min_len = 10000
    sentence_len = []

    with open(raw_post_data, 'r', encoding='utf-8') as post_file, open(
            raw_response_data, 'r', encoding='utf-8') as response_file, \
            open(tokenized_data, 'a', encoding='utf-8') as tokenized_file:
        for post_data, response_data in zip(post_file, response_file):
            post_data = post_data.strip("\n").replace("/", " ")
            response_data = response_data.strip("\n").replace("/", " ")
            if post_data == "" or response_data == "":
                continue

            post_len = len(post_data)
            response_len = len(response_data)
            max_len = max(max_len, post_len, response_len)
            min_len = min(min_len, post_len, response_len)
            sentence_len.append(post_len)
            sentence_len.append(response_len)
            tokenized_file.write(post_data + "\n" + response_data + "\n\n")

            count += 1
            if count % 10000 == 0:
                print("已读取:{}轮对话数据".format(count))

    message = "数据处理完毕:共处理{}轮对话数据,语句最大长度:{},语" \
              "句最短长度{},语句平均长度{:.3f}".format(count, max_len, min_len, np.mean(sentence_len))

    print(message)
    logger = log_operator(level=10)
    logger.info(message)
Example #7
def preprocess_raw_qin_yun_data(raw_data: str,
                                tokenized_data: str,
                                if_remove: bool = True):
    """
    Processes the qingyun dataset into multi-turn form and tokenizes it
    :param raw_data: path of the raw data
    :param tokenized_data: path where the generated token data is saved
    :param if_remove: whether to remove the original tokenized text
    :return: no return value
    """
    _check_file(raw_file=raw_data,
                processed_file=tokenized_data,
                remove_tokenized=if_remove)

    count = 0
    max_len = 0
    min_len = 10000
    sentence_len = []

    with open(raw_data, 'r', encoding='utf-8') as raw_file, open(
            tokenized_data, 'a', encoding='utf-8') as tokenized_file:
        for line in raw_file:
            line = line.strip().strip("\n").replace("/", " ")
            if line == "":
                continue

            for sentence in line.split("|"):
                sentence = sentence.strip()

                length = len(sentence)
                sentence_len.append(length)
                max_len = max(max_len, length)
                min_len = min(min_len, length)
                tokenized_file.write(" ".join(jieba.cut(sentence)) + "\n")
            tokenized_file.write("\n")

            count += 1
            if count % 10000 == 0:
                print("已读取:{}轮对话数据".format(count))

    message = "数据处理完毕,数据信息统计:共处理{}轮对话数据,语句最大长度:{},语" \
              "句最短长度{},语句平均长度{:.3f}".format(count, max_len, min_len, np.mean(sentence_len))

    print(message)
    logger = log_operator(level=10)
    logger.info(message)
Example #8
def preprocess_raw_lccc_data(raw_data_path: str,
                             tokenized_data_path: str,
                             remove_tokenized: bool = True):
    """将LCCC数据集从JSON格式转换每行一条话语

    LCCC原始数据集已分词.

    :param raw_data_path: 原始数据路径
    :param tokenized_data_path: 生成token数据保存路径
    :param remove_tokenized: 是否移除原有分词文本
    :return: 无返回值
    """
    _check_file(raw_file=raw_data_path,
                processed_file=tokenized_data_path,
                remove_tokenized=remove_tokenized)

    count = 0
    max_len = 0
    min_len = 10000
    sentence_len = []

    with open(raw_data_path, 'r', encoding="utf-8") as raw_file, open(
            tokenized_data_path, 'a', encoding="utf-8") as tokenized_file:
        lccc_data = json.load(raw_file)
        for data in lccc_data:
            for sentence in data:
                length = len(sentence)
                sentence_len.append(length)
                max_len = max(max_len, length)
                min_len = min(min_len, length)
                tokenized_file.write(sentence + "\n")

            tokenized_file.write("\n")
            count += 1
            if count % 10000 == 0:
                print("已读取:{}轮对话数据".format(count))

    message = "数据预处理完毕:共处理{}轮对话数据,语句最大长度:{},语" \
              "句最短长度{},语句平均长度{:.3f}".format(count, max_len, min_len, np.mean(sentence_len))

    print(message)
    logger = log_operator(level=10)
    logger.info(message)
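
The loop above assumes the LCCC JSON file is a list of dialogues, each dialogue being a list of already-tokenized (space-separated) utterances, roughly like the made-up sample below.

sample_lccc = [
    ["今天 天气 不错", "是 啊 出去 走走 吧"],
    ["你 吃饭 了 吗", "还 没有 呢"]
]
# Each inner list becomes one block in tokenized_data_path, with a blank line separating dialogues.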
Example #9
def preprocess_raw_xiao_huang_ji_data(raw_data: str,
                                      tokenized_data: str,
                                      if_remove: bool = True):
    """
    Processes the xiaohuangji dataset into multi-turn dialogue form and tokenizes it
    :param raw_data: path of the raw data
    :param tokenized_data: path where the generated token data is saved
    :param if_remove: whether to remove the original tokenized text
    :return: no return value
    """
    _check_file(raw_file=raw_data,
                processed_file=tokenized_data,
                remove_tokenized=if_remove)

    count = 1
    max_len = 0
    min_len = 10000
    sentence_len = []

    with open(raw_data, 'r', encoding="utf-8") as raw_file, open(
            tokenized_data, 'a', encoding="utf-8") as tokenized_file:
        for line in raw_file:
            line = line.strip('\n').replace('/', '')
            if line == "":
                tokenized_file.write("\n")
                count += 1
                if count % 10000 == 0:
                    print("已读取:{}轮对话数据".format(count))
                continue

            length = len(line)
            sentence_len.append(length)
            max_len = max(max_len, length)
            min_len = min(min_len, length)
            tokenized_file.write(" ".join(jieba.cut(line)) + "\n")

    message = "数据处理完毕,数据信息统计:共处理{}轮对话数据,语句最大长度:{},语" \
              "句最短长度{},语句平均长度{:.3f}".format(count, max_len, min_len, np.mean(sentence_len))

    print(message)
    logger = log_operator(level=10)
    logger.info(message)
Example #10
def preprocess_raw_cross_woz_data(raw_data: str,
                                  tokenized_data: str,
                                  if_remove: bool = True):
    """
    Processes the CrossWOZ dataset into multi-turn dialogue form and tokenizes it
    :param raw_data: path of the raw data
    :param tokenized_data: path where the generated token data is saved
    :param if_remove: whether to remove the original tokenized text
    :return: no return value
    """
    _check_file(raw_file=raw_data,
                processed_file=tokenized_data,
                remove_tokenized=if_remove)

    count = 0
    max_len = 0
    min_len = 10000
    sentence_len = []

    with open(raw_data, 'r', encoding='utf-8') as raw_file, open(
            tokenized_data, 'a', encoding='utf-8') as tokenized_file:
        cross_woz_data = json.load(raw_file)
        for data in cross_woz_data:
            turn_utterances = cross_woz_data[data]["messages"]
            for content in turn_utterances:
                sentence = content["content"]
                length = len(sentence)
                sentence_len.append(length)
                max_len = max(max_len, length)
                min_len = min(min_len, length)
                tokenized_file.write(" ".join(jieba.cut(sentence)) + "\n")
            tokenized_file.write("\n")
            count += 1
            if count % 10000 == 0:
                print("已读取:{}轮对话数据".format(count))

    message = "数据处理完毕,数据信息统计:共处理{}轮对话数据,语句最大长度:{},语" \
              "句最短长度{},语句平均长度{:.3f}".format(count, max_len, min_len, np.mean(sentence_len))

    print(message)
    logger = log_operator(level=10)
    logger.info(message)
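
For reference, the loop above expects each CrossWOZ entry to be keyed by a dialogue id and to carry its utterances under "messages"/"content"; the keys come from the code above, the values below are invented for illustration.

sample_cross_woz = {
    "dialogue_0001": {
        "messages": [
            {"content": "你好,帮我推荐一家酒店"},
            {"content": "好的,请问预算大概是多少"}
        ]
    }
}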
Example #11
    def __init__(self,
                 execute_type: str,
                 checkpoint_dir: str,
                 units: int,
                 embedding_dim: int,
                 batch_size: int,
                 start_sign: str,
                 end_sign: str,
                 beam_size: int,
                 vocab_size: int,
                 dict_fn: str,
                 max_length: int,
                 encoder_layers: int,
                 decoder_layers: int,
                 cell_type: str,
                 if_bidirectional: bool = True):
        """
        Seq2Seq chatbot initialization, used to load the model
        :param execute_type: dialogue execution mode
        :param checkpoint_dir: directory path where checkpoints are saved
        :param units: number of units
        :param embedding_dim: embedding dimension
        :param batch_size: batch size
        :param start_sign: start token
        :param end_sign: end token
        :param beam_size: beam size
        :param vocab_size: vocabulary size
        :param dict_fn: path where the dictionary is saved
        :param max_length: maximum length of a single sentence
        :param encoder_layers: number of RNN layers inside the encoder
        :param decoder_layers: number of RNN layers inside the decoder
        :param cell_type: cell type, lstm/gru, default lstm
        :param if_bidirectional: whether to use bidirectional RNNs
        :return: no return value
        """
        super().__init__(checkpoint_dir, beam_size, max_length, dict_fn,
                         start_sign, end_sign)
        self.units = units
        self.batch_size = batch_size
        self.enc_units = units

        self.encoder = seq2seq.encoder(vocab_size=vocab_size,
                                       embedding_dim=embedding_dim,
                                       enc_units=int(units / 2),
                                       num_layers=encoder_layers,
                                       cell_type=cell_type,
                                       if_bidirectional=if_bidirectional)
        self.decoder = seq2seq.decoder(vocab_size=vocab_size,
                                       embedding_dim=embedding_dim,
                                       enc_units=units,
                                       dec_units=units,
                                       num_layers=decoder_layers,
                                       cell_type=cell_type)

        self.optimizer = tf.keras.optimizers.Adam()
        self.train_loss = tf.keras.metrics.Mean()
        self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
        self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction='none')
        self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer,
                                              encoder=self.encoder,
                                              decoder=self.decoder)

        print('Checking whether a checkpoint exists')
        if self.ckpt:
            print('Checkpoint found, loading checkpoint')
            self.checkpoint.restore(
                tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()
        else:
            if execute_type == "train":
                print('No checkpoint found, training from scratch')
            else:
                print('No checkpoint found, please run train mode first before entering chat mode')
                exit(0)

        log_operator(level=10).info(
            "Starting Seq2Seq chatbot, execute type: {}, model config: vocab_size: {}, "
            "embedding_dim: {}, units: {}, max_length: {}".format(
                execute_type, vocab_size, embedding_dim, units, max_length))
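
The loss object above is built with from_logits=True and reduction='none', which suggests the per-token loss is masked and averaged manually during training. A typical masked loss of that shape (not taken from the repository; the padding id 0 is an assumption) looks like this.

import tensorflow as tf

def masked_loss(real, pred, loss_object):
    # Ignore positions where the target token is the padding id (assumed to be 0).
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    per_token_loss = loss_object(real, pred)
    mask = tf.cast(mask, dtype=per_token_loss.dtype)
    per_token_loss *= mask
    return tf.reduce_sum(per_token_loss) / tf.reduce_sum(mask)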
Example #12
    def __init__(self, units: int, vocab_size: int, execute_type: str,
                 dict_fn: str, embedding_dim: int, checkpoint_dir: str,
                 max_utterance: int, max_sentence: int, learning_rate: float,
                 database_fn: str, solr_server: str):
        """
        SMN chatbot initialization, used to load the model
        :param units: number of units
        :param vocab_size: vocabulary size
        :param execute_type: dialogue execution mode
        :param dict_fn: path where the dictionary is saved
        :param embedding_dim: embedding dimension
        :param checkpoint_dir: directory path where checkpoints are saved
        :param max_utterance: number of sentences per round
        :param max_sentence: maximum length of a single sentence
        :param learning_rate: learning rate
        :param database_fn: path of the candidate database
        :param solr_server: Solr server URL
        :return: no return value
        """
        self.dict_fn = dict_fn
        self.checkpoint_dir = checkpoint_dir
        self.max_utterance = max_utterance
        self.max_sentence = max_sentence
        self.database_fn = database_fn
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        self.solr = pysolr.Solr(url=solr_server,
                                always_commit=True,
                                timeout=10)
        self.train_loss = tf.keras.metrics.Mean()

        self.model = smn.smn(units=units,
                             vocab_size=vocab_size,
                             embedding_dim=embedding_dim,
                             max_utterance=self.max_utterance,
                             max_sentence=self.max_sentence)

        self.checkpoint = tf.train.Checkpoint(
            model=self.model,
            optimizer=self.optimizer,
        )

        ckpt = os.path.exists(checkpoint_dir)
        if not ckpt:
            os.makedirs(checkpoint_dir)

        print('Checking whether a checkpoint exists')
        if ckpt:
            print('Checkpoint found in {}, loading checkpoint'.format(checkpoint_dir))
            self.checkpoint.restore(
                tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()
        else:
            if execute_type == "train":
                print('No checkpoint found, running in train mode')
            else:
                print('No checkpoint found, please run train mode first before entering chat mode')
                exit(0)

        logger = utils.log_operator(level=10)
        logger.info("启动SMN聊天器,执行类别为:{},模型参数配置为:embedding_dim:{},"
                    "max_sentence:{},max_utterance:{},units:{},vocab_size:{},"
                    "learning_rate:{}".format(execute_type, embedding_dim,
                                              max_sentence, max_utterance,
                                              units, vocab_size,
                                              learning_rate))
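
The Solr client configured above is used to retrieve candidate responses for the SMN matcher. An illustrative query (the core URL and field name are assumptions, not from the repository):

import pysolr

solr = pysolr.Solr(url="http://localhost:8983/solr/smn", always_commit=True, timeout=10)
results = solr.search(q="utterance:你好", rows=10)  # "utterance" is a hypothetical field name
candidates = [doc for doc in results]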
Example #13
def to_single_turn_dataset(tokenized_data_path: str, qa_data_path: str, dict_path: str, vocab_size: int,
                           start_sign: str = "<start>", end_sign: str = "<end>", unk_sign: str = "<unk>",
                           max_data_size: int = 0, remove_tokenized: bool = True):
    """生成单轮对话数据集

    用于处理已经分词好的多轮次数据集的方法,将数据集处理成问答对的形式
    :param tokenized_data_path: 已切分多轮对话数据路径
    :param qa_data_path: 单轮对话数据保存路径
    :param dict_path: 字典保存路径
    :param vocab_size: 词汇量大小
    :param start_sign: 开始标记
    :param end_sign: 结束标记
    :param unk_sign: 未登录词
    :param max_data_size: 最大加载数据量,,0为所有数据
    :param remove_tokenized: 是否移除原有分词文本
    :return: 无返回值
    """
    # _check_file(raw_file=raw_data_path, processed_file=qa_data_path, remove_tokenized=remove_tokenized)

    count = 0
    sentences_count = 0
    max_len = 0
    min_len = 10000
    sentence_len = []
    one_pair = []
    all_text_list = []

    # Pair up the context of each dialogue round into a question and an answer; skip to the next round when a new round starts
    with open(tokenized_data_path, encoding="utf-8") as raw_file, \
            open(qa_data_path, 'w', encoding="utf-8") as single_turn_data_file:
        for line in raw_file:
            line = line.strip('\n').replace('/', '')
            # line = re.sub(r"[%s]+" % punctuation, "", line)
            # Because the raw dataset is arranged round by round, note that after a
            # round ends its last sentence cannot serve as a question, so skip to the next round
            if line == '':
                one_pair = []
                count += 1
                continue
            elif len(one_pair) == 1:
                one_pair.append(line)
                question = start_sign + " " + one_pair[0] + " " + end_sign
                answer = start_sign + " " + one_pair[1] + " " + end_sign
                single_turn_data_file.write(question + "\t" + answer + "\n")
                all_text_list.append(question)
                all_text_list.append(answer)
                one_pair = [line]
                sentences_count += 1
                print('\rProcessed: {} QA pairs'.format(sentences_count), flush=True, end="")
                if sentences_count == max_data_size:
                    break
            else:
                one_pair.append(line)

            length = len(line)
            max_len = max(max_len, length)
            min_len = min(min_len, length)
            sentence_len.append(length)

    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters="", num_words=vocab_size, oov_token=unk_sign)
    tokenizer.fit_on_texts(all_text_list)
    with open(dict_path, 'w', encoding='utf-8') as dict_file:
        dict_file.write(tokenizer.to_json())

    message = "对话数据集转换完毕,并保存字典:共处理{}轮对话数据,整理出{}对" \
              "问答对,语句最大长度:{},语句最短长度{},语句平均长度{:.3f}".format(count, sentences_count,
                                                           max_len, min_len, np.mean(sentence_len))
    print("\n" + message)
    logger = log_operator(level=10)
    logger.info(message)
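
A minimal sketch (not part of the function above; the path is illustrative) of reloading the tokenizer serialized to dict_path and converting a sentence to token ids:

import tensorflow as tf

with open("data/tokenizer_dict.json", encoding="utf-8") as dict_file:
    tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(dict_file.read())
sequences = tokenizer.texts_to_sequences(["<start> 你好 <end>"])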