Example #1
def load_smn_valid_data(data_fn: str, max_sentence: int, max_utterance: int, max_valid_data_size: int,
                        token_dict: dict = None, tokenizer: tf.keras.preprocessing.text.Tokenizer = None,
                        max_turn_utterances_num: int = 10):
    """
    Loads the SMN evaluation data on its own. This function is designed so that it can be used
    both for evaluation during training and in the standalone evaluate mode.
    Note: one of token_dict and tokenizer must be provided; if both are passed, only tokenizer is used.
    :param data_fn: path to the evaluation data
    :param max_sentence: maximum sentence length
    :param max_utterance: maximum number of history utterances per turn
    :param max_valid_data_size: maximum amount of validation data
    :param token_dict: token dictionary
    :param tokenizer: tokenizer instance
    :param max_turn_utterances_num: batch size of the dataset; ideally a multiple of the total number
                                    of positive and negative samples per dialogue turn
    :return: dataset
    """
    if not os.path.exists(data_fn):
        return

    history = []
    response = []
    label = []
    with open(data_fn, 'r', encoding='utf-8') as file:
        lines = file.read().strip().split("\n")[:max_valid_data_size]
        for line in lines:
            apart = line.split("\t")
            label.append(int(apart[0]))
            response.append(apart[-1])
            del apart[0]
            del apart[-1]
            history.append(apart)

    if tokenizer is not None:
        response = tokenizer.texts_to_sequences(response)
    else:
        response = dict_texts_to_sequences(response, token_dict)
    response = tf.keras.preprocessing.sequence.pad_sequences(response, maxlen=max_sentence, padding="post")

    utterances = []
    for utterance in history:
        pad_sequences = [0] * max_sentence
        if tokenizer is not None:
            utterance_padding = tokenizer.texts_to_sequences(utterance)[-max_utterance:]
        else:
            utterance_padding = dict_texts_to_sequences(utterance, token_dict)[-max_utterance:]

        utterance_len = len(utterance_padding)
        # If the current turn has fewer than max_utterance history utterances, pad at the end
        if utterance_len != max_utterance:
            utterance_padding += [pad_sequences] * (max_utterance - utterance_len)
        utterances.append(tf.keras.preprocessing.sequence.pad_sequences(utterance_padding, maxlen=max_sentence,
                                                                        padding="post").tolist())

    # The dataset is deliberately left unshuffled here to make metric computation easier
    dataset = tf.data.Dataset.from_tensor_slices((utterances, response, label)).prefetch(
        tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(max_turn_utterances_num, drop_remainder=True)

    return dataset
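
A minimal usage sketch for the function above. The file path, the tokenizer corpus and the sizes below are assumptions for illustration; the evaluation file is expected to hold one sample per line in the form label \t utterance_1 \t ... \t response.

# Hypothetical usage; the path, tokenizer corpus and sizes below are assumptions.
import tensorflow as tf

tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="<unk>")
tokenizer.fit_on_texts(["placeholder corpus used only to build a word index"])

valid_dataset = load_smn_valid_data(
    data_fn="data/smn_valid.txt",      # one sample per line: label \t utterance_1 ... \t response
    max_sentence=50,                   # every sentence padded/truncated to 50 tokens
    max_utterance=10,                  # keep at most the last 10 history utterances
    max_valid_data_size=1000,          # read at most 1000 lines
    tokenizer=tokenizer,               # token_dict is ignored when a tokenizer is given
    max_turn_utterances_num=10)        # batch size, e.g. 1 positive + 9 negative samples per turn

if valid_dataset is not None:          # the function returns None when data_fn does not exist
    for utterances, response, label in valid_dataset.take(1):
        print(utterances.shape, response.shape, label.shape)  # (10, 10, 50), (10, 50), (10,)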
Example #2
def _read_data(data_path: str, num_examples: int, max_length: int,
               tokenizer: tf.keras.preprocessing.text.Tokenizer):
    """
    Reads the data, tokenizes the input and target, and returns them.
    :param data_path: path to the tokenized text
    :param num_examples: number of examples to read
    :param max_length: maximum sequence length
    :param tokenizer: existing tokenizer used to convert the text to sequences
    :return: input sequence tensor, target sequence tensor and dialogue weights
    """
    (input_lang,
     target_lang), diag_weight = _create_dataset(data_path, num_examples)
    input_tensor = tokenizer.texts_to_sequences(input_lang)
    target_tensor = tokenizer.texts_to_sequences(target_lang)

    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(
        input_tensor, maxlen=max_length, padding='post')
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(
        target_tensor, maxlen=max_length, padding='post')

    return input_tensor, target_tensor, diag_weight
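
_create_dataset is a project-internal helper that is not shown here, so this function cannot run on its own. The standalone sketch below only illustrates what the texts_to_sequences plus pad_sequences combination used above produces; the toy corpus is an assumption.

import tensorflow as tf

tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(["hello how are you", "fine thank you"])

tensor = tokenizer.texts_to_sequences(["hello how are you"])
tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, maxlen=6, padding="post")
print(tensor)  # e.g. [[2 3 4 1 0 0]] -- zeros are appended because padding="post"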
Example #3
def preprocess_request(sentence: str, max_length: int, start_sign: str,
                       tokenizer: tf.keras.preprocessing.text.Tokenizer):
    """
    Processes an input sentence for the reply feature and returns the sequences the model uses.
    :param sentence: sentence to process
    :param max_length: maximum length of a single sentence
    :param start_sign: start token
    :param tokenizer: tokenizer
    :return: processed sentence and decoder input
    """
    sentence = " ".join(jieba.cut(sentence))

    inputs = tokenizer.texts_to_sequences([sentence])
    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                           maxlen=max_length,
                                                           padding='post')
    dec_input = tf.expand_dims([tokenizer.word_index.get(start_sign)], 0)

    return inputs, dec_input
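
A minimal usage sketch, assuming jieba is installed, the tokenizer was fitted on already-segmented text, and "<start>" is the start sign (filters="" keeps the angle brackets of the marker token); all of these are assumptions for illustration.

import tensorflow as tf

# Hypothetical tokenizer fitted on already-segmented text; "<start>" is an assumed start sign.
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters="")
tokenizer.fit_on_texts(["<start> 你好 吗 <end>"])

inputs, dec_input = preprocess_request(sentence="你好吗", max_length=40,
                                        start_sign="<start>", tokenizer=tokenizer)
print(inputs.shape)     # (1, 40) -- padded token ids of the segmented sentence
print(dec_input.shape)  # (1, 1)  -- id of the start token, the first decoder input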
Example #4
def classify_sentence(model: tf.keras.models.Model,
                      tokenizer: tf.keras.preprocessing.text.Tokenizer,
                      sentence: str, max_sequence_len: int) -> int:
    """ Classify sentence according to the trained model.
        Args:
            model:             classification model for text data
            tokenizer:         tokenizer for this model
            sentence:          sentence to classify
            max_sequence_len:  sequence len used to train the model

        Returns:
            int:               predicted class
    """

    sentence = [[word for word in sentence.split() if word not in STOPWORDS]]
    sequence = tokenizer.texts_to_sequences(sentence)
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        sequence, padding="post", maxlen=max_sequence_len)

    return np.argmax(model.predict(padded))
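
A short usage sketch, assuming classify_sentence and its module-level STOPWORDS set are importable and that a trained model and fitted tokenizer already exist; the tiny untrained model below is only a stand-in to make the call signature concrete.

import tensorflow as tf

# Hypothetical stand-ins for the trained artifacts the function expects.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=1000)
tokenizer.fit_on_texts(["good movie", "bad movie"])

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(1000, 8),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(2, activation="softmax"),
])

predicted_class = classify_sentence(model, tokenizer, "a good movie", max_sequence_len=10)
print(predicted_class)  # index of the most probable class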
Example #5
def generate_text(model: tf.keras.models.Model,
                  tokenizer: tf.keras.preprocessing.text.Tokenizer,
                  seed_text: str, next_words: int) -> str:
    """ Generate the text starting with seed_text.
        Args:
            model:       multiclass classification model for text data
            tokenizer:   tokenizer for this model
            seed_text:   starting sentence
            next_words:  number of words to generate

        Returns:
            str:         generated text
    """

    index_to_word = {
        index: word
        for word, index in tokenizer.word_index.items()
    }
    sequence_len = model.layers[0].input_length

    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = np.array(
        tf.keras.preprocessing.sequence.pad_sequences([token_list],
                                                      maxlen=sequence_len,
                                                      padding='pre'))

    for i in range(next_words):

        curr_sequence = token_list[:, i:i + sequence_len]
        predicted = np.argmax(model.predict(curr_sequence, verbose=0), axis=-1)
        token_list = np.append(token_list,
                               np.reshape(predicted, (1, 1)),
                               axis=1)

    generated_text = [
        index_to_word[index] for index in token_list[0] if index > 0
    ]

    return " ".join(generated_text)
Example #6
def create_n_grams(corpus: List[List[str]],
                   tokenizer: tf.keras.preprocessing.text.Tokenizer,
                   sequence_len: int,
                   max_docs: Optional[int]) -> Tuple[np.ndarray, np.ndarray]:
    """ Split the corpus into n-grams. Shorter sequences will be padded.
        Args:
            corpus:               texts used to create n-grams
            tokenizer:            fitted tokenizer
            sequence_len:         n in the n-grams
            max_docs (optional):  maximum number of documents to take from corpus

        Returns:
            np.ndarray:           predictors, sequences of [i:i+n-1]
            np.ndarray:           predictands, sequence of [i+n]
    """

    input_sequences = []

    for j, phrase in enumerate(corpus):
        token_list = tokenizer.texts_to_sequences([phrase])[0]
        if len(token_list) < 2:
            continue
        elif len(token_list) < sequence_len:
            input_sequences.append(token_list)

        for i in range(1, len(token_list) - sequence_len):
            n_gram_sequence = token_list[i:i + sequence_len]
            input_sequences.append(n_gram_sequence)

        if max_docs and j > max_docs:
            break

    input_sequences = np.array(
        tf.keras.preprocessing.sequence.pad_sequences(input_sequences,
                                                      maxlen=sequence_len,
                                                      padding='pre'))
    predictors, predictands = input_sequences[:, :-1], input_sequences[:, -1]

    return predictors, predictands
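
A minimal usage sketch; the toy corpus and the sequence length are assumptions for illustration.

import tensorflow as tf

corpus = ["the quick brown fox jumps over the lazy dog".split(),
          "pack my box with five dozen liquor jugs".split()]
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(corpus)

predictors, predictands = create_n_grams(corpus, tokenizer, sequence_len=4, max_docs=None)
print(predictors.shape, predictands.shape)  # (num_n_grams, 3) and (num_n_grams,)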