Example #1
    def load(self):
        # Load vocabulary
        self.vocab_words = load_vocab(self.output_words)
        self.vocab_tags = load_vocab(self.output_tags)
        self.vocab_chars = load_vocab(self.output_chars)

        self.nwords = len(self.vocab_words)
        self.nchars = len(self.vocab_chars)
        self.ntags = len(self.vocab_tags)

        # Get processing words
        self.processing_word = get_processing_word(self.vocab_words, self.vocab_chars, lowercase=True, chars=self.use_chars)
        self.processing_tag = get_processing_word(self.vocab_tags, lowercase=False, allow_unk=False)

        # Get pre-trained embeddings
        self.embeddings = (get_trimmed_glove_vectors(self.output_trimmed)
                           if self.use_pretrained else None)
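For reference, the single-return load_vocab consumed in Example #1 is usually just a file with one token per line mapped to its line index. A minimal sketch of that variant (an assumption for illustration, not code from the repository above):

def load_vocab(filename):
    # Map each token (one per line) to its line index.
    vocab = {}
    with open(filename, encoding="utf-8") as f:
        for idx, word in enumerate(f):
            vocab[word.strip()] = idx
    return vocab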
Example #2
    def __init__(self):
        self.vocabulary, self.vocabulary_reverse = load_vocab(args.data_path)
        tf.reset_default_graph()
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        session = tf.Session(config=tf_config)

        with tf.name_scope("Train"):
            with tf.variable_scope("Model"):
                self.model = SEQ2SEQ(session, options, "predict")
        self.model.restore(os.path.join(args.root_path, args.restore_path))
Example #3
def create_dataset(config):
    dataset_class = None
    dataset_params_train = {}
    dataset_params_val = {}
    dataset_params_test = {}

    def add_dataset_param(name, value_train, value_dev=None, value_test=None):
        dataset_params_train[name] = value_train
        dataset_params_val[name] = value_dev if value_dev is not None else value_train
        dataset_params_test[name] = value_test if value_test is not None else value_train

    titles_train, titles_val, titles_test = load_titles()
    add_dataset_param('titles', titles_train, titles_val, titles_test)

    if config.model == 'counts' or config.model == 'counts_deep':
        dataset_class = BiosCountsDataset

        bios_train, bios_val, bios_test = load_bios_counts(config.scrubbed)
        add_dataset_param('bios', bios_train, bios_val, bios_test)

        features_names = load_features_names(config.scrubbed)
        add_dataset_param('feature_names', features_names)
    elif config.model == 'han' or config.model == 'rnn':
        dataset_class = BiosSeqDataset

        bios_train, bios_val, bios_test = load_bios_seq(config.scrubbed)
        add_dataset_param('bios', bios_train, bios_val, bios_test)

        vocab = load_vocab(config.scrubbed)
        add_dataset_param('vocab', vocab)
    else:
        raise ValueError(f'Dataset for the model {config.model} is unknown')

    dataset_train = dataset_class(**dataset_params_train)
    dataset_val = dataset_class(**dataset_params_val)
    dataset_test = dataset_class(**dataset_params_test)

    print(
        f'Dataset: {type(dataset_train).__name__} - {len(dataset_train)}, {len(dataset_val)}, {len(dataset_test)}'
    )
    return dataset_train, dataset_val, dataset_test
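Calling create_dataset only requires a config object exposing model and scrubbed. An illustrative call (the config values and SimpleNamespace wrapper are made up for this sketch):

from types import SimpleNamespace

# Hypothetical config; 'rnn' selects the BiosSeqDataset branch above.
config = SimpleNamespace(model='rnn', scrubbed=False)
dataset_train, dataset_val, dataset_test = create_dataset(config)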
Example #4
def read_test_data():
    with open(args.test_file, "r", encoding="utf-8") as f:
        examples, responses = [], []
        for line in f:
            example = line.split("\t")[-6:]
            example = [
                s.lower().split()[:args.max_utterance_len] for s in example
            ]
            examples.append(example)
            responses.append(example[-1])
    return examples, responses


if __name__ == "__main__":
    vocabulary, vocabulary_reverse = load_vocab(args.data_path)
    examples, responses = read_test_data()

    tf.reset_default_graph()
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    session = tf.Session(config=tf_config)

    with tf.name_scope("Train"):
        with tf.variable_scope("Model"):
            model = SEQ2SEQ(session, options, "predict")
    model.restore(os.path.join(args.root_path, args.restore_path))

    num_examples = len(examples)
    num_batches = num_examples // options.batch_size
    predict_responses = []
Example #5
	def __init__(self, is_training=True):
		self.graph = Graph()
		with self.graph.as_default():
			if is_training:
				self.x, self.y = get_batch_data()
			else:
				self.x = tf.placeholder(tf.int32, shape=(None, pre.maxlen))
				self.y = tf.placeholder(tf.int32, shape=(None, pre.maxlen))

			self.decoder_input = tf.concat((tf.ones_like(self.y[:,:1])*2,self.y[:,:-1]),axis=-1)
			en2id, id2en = load_vocab("./data/en_words.txt")
			ch2id, id2ch = load_vocab("./data/ch_words.txt")

			with tf.variable_scope("encoder"):
				self.enc = embedding(self.x, vocab_size=len(en2id), embedding_dim=pre.embedding)
				key_masks = tf.expand_dims(tf.sign(tf.reduce_sum(tf.abs(self.enc),axis=-1)), -1)
				self.enc = self.enc + position_embedding(self.x, embedding_dim=pre.embedding)
				self.enc = self.enc*key_masks
				self.enc = tf.layers.dropout(self.enc, rate=0.1, training=tf.convert_to_tensor(is_training))

				for i in range(6):
					self.enc = multihead_attention(queries=self.enc,
												keys=self.enc,
												embedding_dim=pre.embedding,
												num_head=8,
												dropout_rate=0.1,
												is_training=is_training,
												future_blind=False)
					self.enc = feedforward(inputs=self.enc)

			with tf.variable_scope("decode"):
				self.dec = embedding(inputs=self.y, vocab_size=len(ch2id), embedding_dim=pre.embedding)
				key_masks = tf.expand_dims(tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)),-1)
				self.dec = self.dec + position_embedding(self.y, embedding_dim=pre.embedding)
				self.dec = self.dec*key_masks
				self.dec = tf.layers.dropout(self.dec, rate=0.1, training=tf.convert_to_tensor(is_training))

				for i in range(6):
					self.dec = multihead_attention(queries=self.dec,
													keys=self.dec,
													embedding_dim=pre.embedding,
													num_head=8,
													dropout_rate=0.1,
													is_training=is_training,
													future_blind=True)
					self.dec = multihead_attention(queries=self.dec,
													keys=self.enc,
													embedding_dim=pre.embedding,
													num_head=8,
													dropout_rate=0.1,
													is_training=is_training,
													future_blind=False)
					self.dec = feedforward(self.dec)

			self.logits = tf.layers.dense(self.dec, len(ch2id))
			self.preds = tf.to_int32(tf.argmax(self.logits,axis=-1))
			self.istarget = tf.to_float(tf.not_equal(self.y, 0))
			self.acc = tf.reduce_sum(tf.to_float(tf.equal(self.preds,self.y))*self.istarget)/(tf.reduce_sum(self.istarget))
			if is_training:
				self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(ch2id)))
				self.loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.y_smoothed, logits=self.logits)
				self.mean_loss = tf.reduce_sum(self.loss*self.istarget)/tf.reduce_sum(self.istarget)
				self.optimizer = tf.train.AdamOptimizer(learning_rate=0.0001,
														beta1=0.9,
														beta2=0.98,
														epsilon=1e-8)
				self.opt = self.optimizer.minimize(self.mean_loss)
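The class in Example #5 only builds the graph; a driver still has to run self.opt in a session. A minimal TF1-style sketch, assuming a hypothetical class name TransformerModel and that get_batch_data() sets up a queue-based input pipeline:

# Hypothetical driver; "TransformerModel" stands in for the unnamed class above.
model = TransformerModel(is_training=True)
with model.graph.as_default():
    sv = tf.train.Supervisor(logdir="./logdir")  # handles init and queue runners
    with sv.managed_session() as sess:
        for step in range(100000):
            if sv.should_stop():
                break
            _, loss = sess.run([model.opt, model.mean_loss])
            if step % 100 == 0:
                print("step %d, loss %.4f" % (step, loss))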
Example #6
    dataset_dev = DatasetHandler(config.conll_dev, target_word)
    dataset_test = DatasetHandler(config.conll_test, target_word)
    dataset_train = DatasetHandler(config.conll_train, target_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs(
        [dataset_train, dataset_dev, dataset_test])
    vocab_glove = get_glove_vocab(config.output_glove)
    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Process vocab
    write_vocab(vocab, config.output_words)
    write_vocab(vocab_tags, config.output_tags)
    vocab = load_vocab(config.output_words)
    export_trimmed_glove_vectors(vocab, config.output_glove,
                                 config.output_trimmed, config.dim_word)

    # Build and save char vocab
    train = DatasetHandler(config.conll_train)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.output_chars)

    # build model
    train_config = Config()
    model = BilstmModel(train_config)
    model.build()

    # create datasets
    dev = DatasetHandler(train_config.conll_dev, train_config.processing_word,
Example #7
    num_hidden_layers = CONFIG["num_hidden_layers"]
    embedding_size = CONFIG["embedding_size"]
    char_embedding_size = CONFIG.get("char_embedding_size", 100)
    nb_filters = CONFIG.get("nb_filters", 10)
    hidden_layer_size = CONFIG["hidden_layer_size"]
    RNN_LAYER_TYPE = CONFIG.get("RNN_LAYER_TYPE", "LSTM")
    optimizer = CONFIG["optimizer"]
    n_epochs = CONFIG["n_epochs"] + base_epochs
    save_every = CONFIG["save_every"]
    model_type = CONFIG.get("model_type", "rnn") # rnn, brnn

    RNN_CLASS = LSTM
    if RNN_LAYER_TYPE == "GRU":
        RNN_CLASS = GRU

    index_word, word_dict = pp.load_vocab(vocab_file)
    char_dict = {}
    if char_vocab_file is not None:
        index_char, char_dict = pp.load_vocab(char_vocab_file)
        char_vocab_size = len(index_char) + 2 # Add offset for OOV and padding
    pp.WordToken.set_vocab(word_dict = word_dict, char_dict = char_dict)
    index_labels, labels_dict = pp.load_vocab(labels_file)
    index_boundary, boundary_dict = pp.load_vocab(boundary_file)
    index_category, category_dict = pp.load_vocab(category_file)
    vocab_size = len(index_word) + pp.WordToken.VOCAB + 1 # Add offset of VOCAB and then extra token for padding
    labels_size = len(index_labels) + 1 # Add extra token for padding
    boundary_size = len(index_boundary) + 1 # Add extra token for padding 
    category_size = len(index_category) + 1 # Add extra token for padding

    logger.info("Parameters: vocab_size = %s, label_type = %s, labels_size = %s, embedding_size = %s, maxlen = %s, boundary_size = %s, category_size = %s, hidden_layer_size = %s" %\
                    (vocab_size, label_type, labels_size, embedding_size, maxlen, boundary_size, category_size, hidden_layer_size))
Example #8
# -*- coding: utf-8 -*-
# @Time : 2020/8/21 2:34 PM
# @Author : chezhonghao
# @description : 
# @File : predict_service.py
# @Software: PyCharm
import os
os.environ['CUDA_VISIBLE_DEVICES']="-1"
from base_transformer_model.config import Config
from preprocess import tfrecord_reader, encode_sentence, load_vocab
from base_transformer_model.component import create_look_ahead_mask, create_padding_mask
import tensorflow as tf
model = tf.saved_model.load('./saved_transformer_model', tags=[tf.saved_model.SERVING])
# inference=base_transformer_model.signatures['serving_default']
t2i, i2t = load_vocab('./transformer_model/vocab.txt')
def create_mask(dialogue, question, report):
    # Encoder padding mask.
    dialogue_padding_mask = create_padding_mask(dialogue)
    # Used in the decoder's second attention block.
    # This padding mask is used to mask the encoder outputs.
    question_padding_mask = create_padding_mask(question)
    # Used in the decoder's first attention block.
    # It pads and masks the future tokens in the input received by the decoder.
    look_ahead_mask = create_look_ahead_mask(tf.shape(report)[1])
    dec_target_padding_mask = create_padding_mask(report)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
    return dialogue_padding_mask, combined_mask, question_padding_mask
def predict(inp_sentence, question, report):
    dialogue = encode_sentence(mode='dialogue', sentence=inp_sentence, token2id=t2i, maxlen=Config.max_len['dialogue'])
    question = encode_sentence(mode='question', sentence=question, token2id=t2i, maxlen=Config.max_len['question'])
    report = encode_sentence(mode='tar_report_input', sentence='', token2id=t2i, maxlen=Config.max_len['report'])

Example #9
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=100000):
        super(CustomSchedule, self).__init__()
        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


t2i, i2t = load_vocab(Config.vocab_path)
model.load_weights('../train_transformer_saved_weights/')


def create_mask(dialogue, question, report):
    # Encoder padding mask.
    dialogue_padding_mask = create_padding_mask(dialogue)
    # Used in the decoder's second attention block.
    # This padding mask is used to mask the encoder outputs.
    question_padding_mask = create_padding_mask(question)
    # Used in the decoder's first attention block.
    # It pads and masks the future tokens in the input received by the decoder.
    look_ahead_mask = create_look_ahead_mask(tf.shape(report)[1])
    dec_target_padding_mask = create_padding_mask(report)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
    return dialogue_padding_mask, combined_mask, question_padding_mask
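CustomSchedule above is the usual Transformer warmup schedule and is normally passed straight to Adam. An illustrative pairing (the d_model value here is assumed):

learning_rate = CustomSchedule(d_model=512)  # d_model chosen for illustration only
optimizer = tf.keras.optimizers.Adam(learning_rate,
                                     beta_1=0.9, beta_2=0.98, epsilon=1e-9)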
Example #10
    def embedding_layer(self):
        context_emb = None
        query_emb = None
        reply_emb = None
        if self.config.use_word_embeddings:
            self.embedding_dim += self.config.word_embeddings_dim
            context_word_input = Input(shape=(self.config.max_utterance_num *
                                              self.config.max_utterance_len, ),
                                       dtype='int32',
                                       name='context_word_input')
            query_word_input = Input(shape=(self.config.max_utterance_len, ),
                                     dtype='int32',
                                     name='query_word_input')
            reply_word_input = Input(shape=(self.config.max_utterance_len, ),
                                     dtype='int32',
                                     name='reply_word_input')

            self.input_list.extend(
                [context_word_input, query_word_input, reply_word_input])

            # load word embeddings
            emb_matrix = load_word_embeddings(
                self.config, self.config.word_embeddings_file,
                load_vocab(self.config.word_vocab_file))
            word_emb_layer = Embedding(
                input_dim=self.config.word_vocab_size,
                output_dim=self.config.word_embeddings_dim,
                weights=[emb_matrix],
                trainable=True)
            context_word_emb = word_emb_layer(context_word_input)
            context_word_emb = Reshape((self.config.max_utterance_num, self.config.max_utterance_len, \
                self.config.word_embeddings_dim))(context_word_emb)
            query_word_emb = word_emb_layer(query_word_input)
            reply_word_emb = word_emb_layer(reply_word_input)

            context_emb = context_word_emb
            query_emb = query_word_emb
            reply_emb = reply_word_emb
        if self.config.use_char_embeddings:
            self.embedding_dim += self.config.char_features_dim
            context_char_input = Input(shape=(self.config.max_utterance_num,
                                              self.config.max_utterance_len,
                                              self.config.max_token_len),
                                       dtype='int32',
                                       name='context_char_input')
            query_char_input = Input(shape=(self.config.max_utterance_len,
                                            self.config.max_token_len),
                                     dtype='int32',
                                     name='query_char_input')
            reply_char_input = Input(shape=(self.config.max_utterance_len,
                                            self.config.max_token_len),
                                     dtype='int32',
                                     name='reply_char_input')

            self.input_list.extend(
                [context_char_input, query_char_input, reply_char_input])

            char_emb_layer = Embedding(
                input_dim=self.config.char_vocab_size,
                output_dim=self.config.char_embeddings_dim)
            context_char_emb = Reshape((self.config.max_utterance_num * self.config.max_utterance_len * self.config.max_token_len,\
                ))(context_char_input)
            query_char_emb = Reshape(
                (self.config.max_utterance_len *
                 self.config.max_token_len, ))(query_char_input)
            reply_char_emb = Reshape(
                (self.config.max_utterance_len *
                 self.config.max_token_len, ))(reply_char_input)
            context_char_emb = char_emb_layer(context_char_emb)
            query_char_emb = char_emb_layer(query_char_emb)
            reply_char_emb = char_emb_layer(reply_char_emb)

            char_cnn_layer = Conv1D(filters=self.config.char_features_dim, kernel_size=self.config.char_kernel_shape,\
                activation='tanh')
            context_char_emb = Reshape((self.config.max_utterance_num * self.config.max_utterance_len, \
                self.config.max_token_len, self.config.char_embeddings_dim))(context_char_emb)
            context_char_emb = TimeDistributed(char_cnn_layer)(
                context_char_emb)
            context_char_emb = TimeDistributed(
                GlobalMaxPooling1D())(context_char_emb)
            context_char_emb = Reshape(
                (self.config.max_utterance_num, self.config.max_utterance_len,
                 self.config.char_features_dim))(context_char_emb)

            query_char_emb = Reshape((self.config.max_utterance_len, self.config.max_token_len, \
                self.config.char_embeddings_dim))(query_char_emb)
            query_char_emb = TimeDistributed(char_cnn_layer)(query_char_emb)
            query_char_emb = TimeDistributed(
                GlobalMaxPooling1D())(query_char_emb)
            query_char_emb = Reshape(
                (self.config.max_utterance_len,
                 self.config.char_features_dim))(query_char_emb)

            reply_char_emb = Reshape((self.config.max_utterance_len, self.config.max_token_len, \
                self.config.char_embeddings_dim))(reply_char_emb)
            reply_char_emb = TimeDistributed(char_cnn_layer)(reply_char_emb)
            reply_char_emb = TimeDistributed(
                GlobalMaxPooling1D())(reply_char_emb)
            reply_char_emb = Reshape(
                (self.config.max_utterance_len,
                 self.config.char_features_dim))(reply_char_emb)

            if context_emb is not None:
                context_emb = ly.concatenate(
                    inputs=[context_emb, context_char_emb])
                query_emb = ly.concatenate(inputs=[query_emb, query_char_emb])
                reply_emb = ly.concatenate(inputs=[reply_emb, reply_char_emb])
            else:
                context_emb = context_char_emb
                query_emb = query_char_emb
                reply_emb = reply_char_emb

        self.embedding_dim += 1
        context_feature_input = Input(shape=(self.config.max_utterance_num * self.config.max_utterance_len,),\
            dtype='float32', name='context_feature_input')
        query_feature_input = Input(shape=(self.config.max_utterance_len,), \
            dtype='float32', name='query_feature_input')
        reply_feature_input = Input(shape=(self.config.max_utterance_len,),\
            dtype='float32', name='reply_feature_input')

        self.input_list.extend(
            [context_feature_input, query_feature_input, reply_feature_input])
        #assert context_emb is not None

        reply_emb = Dropout(self.config.dropout_rate)(reply_emb)
        context_feature = Reshape(
            (self.config.max_utterance_num, self.config.max_utterance_len,
             1))(context_feature_input)
        context_emb = ly.concatenate(inputs=[context_emb, context_feature])
        query_feature = Reshape(
            (self.config.max_utterance_len, 1))(query_feature_input)
        query_emb = ly.concatenate(inputs=[query_emb, query_feature])
        reply_feature = Reshape(
            (self.config.max_utterance_len, 1))(reply_feature_input)
        reply_emb = ly.concatenate(inputs=[reply_emb, reply_feature])

        if self.config.use_char_embeddings:
            return context_emb, context_char_emb, query_emb, query_char_emb, reply_emb, reply_char_emb
        else:
            return context_emb, query_emb, reply_emb
Example #11
import preprocess

preprocess.preprocess_data(read_file="data/sentences.train",
                           write_file="data/sentences.train.preprocess",
                           vocab_size=20000)

vocab, inv_vocab = preprocess.load_vocab()
data = preprocess.load_preprocessed_data()

preprocess.preprocess_eval_data(vocab,
                                read_file="data/sentences.test",
                                write_file="data/sentences.test.preprocess")

eval_data = preprocess.load_preprocessed_data(
    read_file="data/sentences.test.preprocess")

print("[Preprocessing done]")
print("vocab len=" + str(len(vocab)))
print("sentences num=" + str(len(data)))
print("sentence len=" + str(len(data[0])))
print("test num=" + str(len(eval_data)))
print("test len=" + str(len(eval_data[0])))