Example #1
    def prepare_model(self):
        with tf.variable_scope("LSTMTDNN"):
            self.char_inputs = []
            self.word_inputs = []
            self.cnn_outputs = []

            # at least one of use_char / use_word is expected to be enabled
            if self.use_char:
                char_W = tf.get_variable(
                    "char_embed", [self.char_vocab_size, self.char_embed_dim])
            if self.use_word:
                word_W = tf.get_variable(
                    "word_embed", [self.word_vocab_size, self.word_embed_dim])

            with tf.variable_scope("CNN") as scope:
                self.char_inputs = tf.placeholder(
                    tf.int32,
                    [self.batch_size, self.seq_length, self.max_word_length])
                self.word_inputs = tf.placeholder(
                    tf.int32, [self.batch_size, self.seq_length])

                char_indices = tf.split(1, self.seq_length, self.char_inputs)
                word_indices = tf.split(1, self.seq_length,
                                        tf.expand_dims(self.word_inputs, -1))

                for idx in xrange(self.seq_length):
                    char_index = tf.reshape(char_indices[idx],
                                            [-1, self.max_word_length])
                    word_index = tf.reshape(word_indices[idx], [-1, 1])

                    if idx != 0:
                        scope.reuse_variables()

                    if self.use_char:
                        # [batch_size x word_max_length, char_embed]
                        char_embed = tf.nn.embedding_lookup(char_W, char_index)

                        char_cnn = TDNN(char_embed, self.char_embed_dim,
                                        self.feature_maps, self.kernels)

                        if self.use_word:
                            word_embed = tf.nn.embedding_lookup(
                                word_W, word_index)
                            # concatenate char-CNN features and the word embedding
                            # (pre-1.0 signature: tf.concat(concat_dim, values))
                            cnn_output = tf.concat(
                                1, [char_cnn.output, tf.squeeze(word_embed, [1])])
                        else:
                            cnn_output = char_cnn.output
                    else:
                        cnn_output = tf.squeeze(
                            tf.nn.embedding_lookup(word_W, word_index))

                    if self.use_batch_norm:
                        bn = batch_norm()
                        norm_output = bn(
                            tf.expand_dims(tf.expand_dims(cnn_output, 1), 1))
                        cnn_output = tf.squeeze(norm_output)

                    if self.highway_layers > 0:
                        cnn_output = highway(cnn_output,
                                             cnn_output.get_shape()[1],
                                             self.highway_layers, 0)

                    self.cnn_outputs.append(cnn_output)

            with tf.variable_scope("LSTM") as scope:
                self.cell = rnn_cell.BasicLSTMCell(self.rnn_size)
                self.stacked_cell = rnn_cell.MultiRNNCell([self.cell] *
                                                          self.layer_depth)

                outputs, _ = rnn.rnn(self.stacked_cell,
                                     self.cnn_outputs,
                                     dtype=tf.float32)

                self.lstm_outputs = []
                self.true_outputs = tf.placeholder(
                    tf.float32,
                    [self.batch_size, self.seq_length, self.word_vocab_size])

                loss = 0
                true_outputs = tf.split(1, self.seq_length, self.true_outputs)

                for idx, (top_h,
                          true_output) in enumerate(zip(outputs,
                                                        true_outputs)):
                    if self.dropout_prob > 0:
                        top_h = tf.nn.dropout(top_h, self.dropout_prob)

                    if self.hsm > 0:
                        # hierarchical softmax keeps the raw hidden states; the
                        # cross-entropy below only covers the full-softmax branch
                        self.lstm_outputs.append(top_h)
                    else:
                        if idx != 0:
                            scope.reuse_variables()
                        proj = rnn_cell.linear(top_h, self.word_vocab_size, 0)
                        self.lstm_outputs.append(tf.log(tf.nn.softmax(proj)))

                        # feed the unnormalized projection to the loss, since
                        # softmax_cross_entropy_with_logits applies softmax itself
                        loss += tf.nn.softmax_cross_entropy_with_logits(
                            proj, tf.squeeze(true_output))

                self.loss = tf.reduce_mean(loss) / self.seq_length

                tf.scalar_summary("loss", self.loss)
                tf.scalar_summary("perplexity", tf.exp(self.loss))
Example #2
    def __init__(self, config):
        """Build model(define computational blocks).

        Args:
          config: an instance of Config class.
        """
        self.config = config
        self.embvec = config.embvec
        self.wrd_vocab_size = len(self.embvec.wrd_embeddings)
        self.wrd_dim = config.wrd_dim
        self.word_length = config.word_length
        self.chr_vocab_size = len(self.embvec.chr_vocab)
        self.chr_dim = config.chr_dim
        self.pos_vocab_size = len(self.embvec.pos_vocab)
        self.pos_dim = config.pos_dim
        self.chk_vocab_size = len(self.embvec.chk_vocab)
        self.chk_dim = config.chk_dim
        self.class_size = config.class_size
        self.use_crf = config.use_crf
        self.emb_class = config.emb_class
        self.is_training = config.is_training
        self.print_local_devices(self.is_training)

        """
        Input layer
        """
        self.is_train = tf.placeholder(tf.bool, name='is_train')
        self.sentence_length = tf.placeholder(tf.int32, name='sentence_length')
        self.keep_prob = tf.cond(self.is_train, lambda: config.keep_prob, lambda: 1.0)

        # pos embedding
        self.input_data_pos_ids = tf.placeholder(tf.int32, shape=[None, None], name='input_data_pos_ids') # (batch_size, sentence_length)
        self.sentence_masks   = self.__compute_sentence_masks(self.input_data_pos_ids)
        sentence_lengths = self.__compute_sentence_lengths(self.sentence_masks)
        self.sentence_lengths = tf.identity(sentence_lengths, name='sentence_lengths')
        masks = tf.to_float(tf.expand_dims(self.sentence_masks, -1)) # (batch_size, sentence_length, 1)
        self.pos_embeddings = self.__pos_embedding(self.input_data_pos_ids, keep_prob=self.keep_prob, scope='pos-embedding')

        # chk embedding
        self.input_data_chk_ids = tf.placeholder(tf.int32, shape=[None, None], name='input_data_chk_ids') # (batch_size, sentence_length)
        self.chk_embeddings = self.__chk_embedding(self.input_data_chk_ids, keep_prob=self.keep_prob, scope='chk-embedding')

        # (large) word embedding data
        self.wrd_embeddings_init = tf.placeholder(tf.float32, shape=[self.wrd_vocab_size, self.wrd_dim], name='wrd_embeddings_init')
        self.wrd_embeddings = tf.Variable(self.wrd_embeddings_init, name='wrd_embeddings', trainable=False)
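        # initializing the frozen word-embedding variable from a placeholder keeps
        # the large pretrained matrix out of the serialized graph; it is fed once,
        # via wrd_embeddings_init, when the variable initializer is run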
        # word embeddings
        self.input_data_word_ids = tf.placeholder(tf.int32, shape=[None, None], name='input_data_word_ids') # (batch_size, sentence_length)
        self.word_embeddings = self.__word_embedding(self.input_data_word_ids, keep_prob=self.keep_prob, scope='word-embedding')

        # character embeddings
        self.input_data_wordchr_ids = tf.placeholder(tf.int32,
                                                     shape=[None, None, self.word_length], # (batch_size, sentence_length, word_length)
                                                     name='input_data_wordchr_ids')
        if config.chr_conv_type == 'conv1d':
            self.wordchr_embeddings = self.__wordchr_embedding_conv1d(self.input_data_wordchr_ids,
                                                                      keep_prob=self.keep_prob,
                                                                      scope='wordchr-embedding-conv1d')
        else:
            self.wordchr_embeddings = self.__wordchr_embedding_conv2d(self.input_data_wordchr_ids,
                                                                      keep_prob=self.keep_prob,
                                                                      scope='wordchr-embedding-conv2d')

        if 'elmo' in self.emb_class:
            # elmo embeddings
            self.elmo_bilm = config.elmo_bilm
            elmo_keep_prob = tf.cond(self.is_train, lambda: config.elmo_keep_prob, lambda: 1.0)
            self.elmo_input_data_wordchr_ids = tf.placeholder(tf.int32,
                                                              shape=[None, None, self.word_length], # (batch_size, sentence_length+2, word_length)
                                                              name='elmo_input_data_wordchr_ids')   # '+2' stands for '<S>', '</S>'
            self.elmo_embeddings = self.__elmo_embedding(self.elmo_input_data_wordchr_ids, masks, keep_prob=elmo_keep_prob)
        if 'bert' in self.emb_class:
            # bert embeddings in subgraph
            self.bert_config = config.bert_config
            self.bert_init_checkpoint = config.bert_init_checkpoint
            self.bert_input_data_token_ids   = tf.placeholder(tf.int32, shape=[None, config.bert_max_seq_length], name='bert_input_data_token_ids')
            self.bert_input_data_token_masks = tf.placeholder(tf.int32, shape=[None, config.bert_max_seq_length], name='bert_input_data_token_masks') 
            self.bert_input_data_segment_ids = tf.placeholder(tf.int32, shape=[None, config.bert_max_seq_length], name='bert_input_data_segment_ids') 
            bert_embeddings_subgraph = self.__bert_embedding(self.bert_input_data_token_ids,
                                                             self.bert_input_data_token_masks,
                                                             self.bert_input_data_segment_ids)
            self.bert_embeddings_subgraph = tf.identity(bert_embeddings_subgraph, name='bert_embeddings_subgraph')

            # bert embedding at runtime
            self.bert_embeddings = tf.placeholder(tf.float32, shape=[None, config.bert_max_seq_length, config.bert_dim], name='bert_embeddings')
            bert_keep_prob = tf.cond(self.is_train, lambda: config.bert_keep_prob, lambda: 1.0)
            self.bert_embeddings = tf.nn.dropout(self.bert_embeddings, bert_keep_prob)
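            # note: self.bert_embeddings now refers to the dropout output; the
            # underlying placeholder is still reachable (and fed) via its graph
            # name 'bert_embeddings'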

        concat_in = [self.word_embeddings, self.wordchr_embeddings, self.pos_embeddings, self.chk_embeddings]
        if self.emb_class == 'elmo':
            concat_in = [self.word_embeddings, self.wordchr_embeddings, self.elmo_embeddings, self.pos_embeddings, self.chk_embeddings]
        if self.emb_class == 'bert':
            concat_in = [self.word_embeddings, self.wordchr_embeddings, self.bert_embeddings, self.pos_embeddings, self.chk_embeddings]
        if self.emb_class == 'bert+elmo':
            concat_in = [self.word_embeddings, self.wordchr_embeddings, self.bert_embeddings, self.elmo_embeddings, self.pos_embeddings, self.chk_embeddings]
        self.input_data = tf.concat(concat_in, axis=-1, name='input_data') # (batch_size, sentence_length, input_dim)
        
        # highway network
        if config.highway_used:
            input_dim = self.input_data.get_shape()[-1]
            self.input_data = tf.reshape(self.input_data, [-1, input_dim]) 
            self.input_data = highway(self.input_data, input_dim, num_layers=2, scope='highway')
            self.input_data = tf.reshape(self.input_data, [-1, self.sentence_length, input_dim])
            self.input_data = tf.nn.dropout(self.input_data, keep_prob=self.keep_prob)

        # masking: zero out padded positions so they do not contribute downstream
        self.input_data *= masks

        """
        RNN layer
        """
        self.rnn_output = self.__bi_rnn(self.input_data)

        """
        Transformer layer
        """
        self.transformed_output = self.__transform(self.rnn_output, masks)

        """
        Projection layer
        """
        self.logits = self.__projection(self.transformed_output,
                                        self.class_size,
                                        scope='projection') # (batch_size, sentence_length, class_size)

        """
        Output answer
        """
        self.output_data = tf.placeholder(tf.float32,
                                          shape=[None, None, self.class_size], # (batch_size, sentence_length, class_size)
                                          name='output_data')
        self.output_data_indices = tf.argmax(self.output_data, axis=-1, output_type=tf.int32) # (batch_size, sentence_length)

        """
        Prediction
        """
        self.prediction = self.__compute_prediction()
        self.logits_indices = tf.identity(self.prediction, name='logits_indices')
Example #3
  def prepare_model(self):
    with tf.variable_scope("LSTMTDNN"):
      self.char_inputs = []
      self.word_inputs = []
      self.cnn_outputs = []

      if self.use_char:
        char_W = tf.get_variable("char_embed",
            [self.char_vocab_size, self.char_embed_dim])
      if self.use_word:
        word_W = tf.get_variable("word_embed",
            [self.word_vocab_size, self.word_embed_dim])

      with tf.variable_scope("CNN") as scope:
        self.char_inputs = tf.placeholder(tf.int32, [self.batch_size, self.seq_length, self.max_word_length])
        self.word_inputs = tf.placeholder(tf.int32, [self.batch_size, self.seq_length])

        char_indices = tf.split(1, self.seq_length, self.char_inputs)
        word_indices = tf.split(1, self.seq_length, tf.expand_dims(self.word_inputs, -1))

        for idx in xrange(self.seq_length):
          char_index = tf.reshape(char_indices[idx], [-1, self.max_word_length])
          word_index = tf.reshape(word_indices[idx], [-1, 1])

          if idx != 0:
            scope.reuse_variables()

          if self.use_char:
            # [batch_size x word_max_length, char_embed]
            char_embed = tf.nn.embedding_lookup(char_W, char_index)

            char_cnn = TDNN(char_embed, self.char_embed_dim, self.feature_maps, self.kernels)

            if self.use_word:
              word_embed = tf.nn.embedding_lookup(word_W, word_index)
              cnn_output = tf.concat(1, [char_cnn.output, tf.squeeze(word_embed, [1])])
            else:
              cnn_output = char_cnn.output
          else:
            cnn_output = tf.squeeze(tf.nn.embedding_lookup(word_W, word_index))

          if self.use_batch_norm:
            bn = batch_norm()
            norm_output = bn(tf.expand_dims(tf.expand_dims(cnn_output, 1), 1))
            cnn_output = tf.squeeze(norm_output)

          if self.highway_layers > 0:
            cnn_output = highway(cnn_output, cnn_output.get_shape()[1], self.highway_layers, 0)

          self.cnn_outputs.append(cnn_output)

      with tf.variable_scope("LSTM") as scope:
        self.cell = tf.nn.rnn_cell.BasicLSTMCell(self.rnn_size)
        self.stacked_cell = tf.nn.rnn_cell.MultiRNNCell([self.cell] * self.layer_depth)

        outputs, _ = tf.nn.rnn(self.stacked_cell,
                               self.cnn_outputs,
                               dtype=tf.float32)

        self.lstm_outputs = []
        self.true_outputs = tf.placeholder(tf.int64,
            [self.batch_size, self.seq_length])

        loss = 0
        true_outputs = tf.split(1, self.seq_length, self.true_outputs)

        for idx, (top_h, true_output) in enumerate(zip(outputs, true_outputs)):
          if self.dropout_prob > 0:
            top_h = tf.nn.dropout(top_h, self.dropout_prob)

          if self.hsm > 0:
            # hierarchical softmax keeps the raw hidden states; the loss below
            # only covers the full-softmax branch
            self.lstm_outputs.append(top_h)
          else:
            if idx != 0:
              scope.reuse_variables()
            proj = tf.nn.rnn_cell._linear(top_h, self.word_vocab_size, 0)
            self.lstm_outputs.append(proj)

            loss += tf.nn.sparse_softmax_cross_entropy_with_logits(
                self.lstm_outputs[idx], tf.squeeze(true_output))

        self.loss = tf.reduce_mean(loss) / self.seq_length

        tf.scalar_summary("loss", self.loss)
        tf.scalar_summary("perplexity", tf.exp(self.loss))
Example #4
    def _build(self, pretrained_word_embedding, bert_word_embedding):
        with tf.variable_scope("LSTMTDNN"):
            with tf.device('/cpu:0'):
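                # embedding tables are created and looked up on the CPU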
                if self.use_char:
                    self.char_embedding = tf.get_variable(
                        "char_matrix", [self.char_vocab_size, self.c_emb_size],
                        initializer=tf.uniform_unit_scaling_initializer())

                if self.use_pts:
                    self.postag_embedding = tf.get_variable(
                        "postag_matrix",
                        [self.postag_vocab_size, self.pt_emb_size],
                        initializer=tf.uniform_unit_scaling_initializer())

                if self.use_word:
                    if pretrained_word_embedding is None:
                        self.word_embedding = tf.get_variable(
                            "word_matrix",
                            [self.word_vocab_size, self.w_emb_size],
                            initializer=tf.uniform_unit_scaling_initializer())
                    else:
                        self.word_embedding = tf.get_variable(
                            "word_matrix",
                            [self.word_vocab_size, self.w_emb_size],
                            initializer=tf.constant_initializer(
                                pretrained_word_embedding),
                            trainable=False)

                if self.use_bert_word:
                    if bert_word_embedding is None:
                        self.bio_bert_word_embedding = tf.get_variable(
                            "bio_bert_word_matrix", [
                                self.bert_vocab_size,
                                self.bert_word_embedding_dim
                            ],
                            initializer=tf.uniform_unit_scaling_initializer())
                    else:
                        self.bio_bert_word_embedding = tf.get_variable(
                            "bio_bert_word_matrix", [
                                self.bert_vocab_size,
                                self.bert_word_embedding_dim
                            ],
                            initializer=tf.constant_initializer(
                                bert_word_embedding),
                            trainable=False)
                # char_vecs: sentence_len x max_word_len x embedding_len
                if self.use_char:
                    char_vecs = tf.nn.embedding_lookup(self.char_embedding,
                                                       self.char_input)

                # word_vec  sentence_len x embedding_len
                if self.use_word:
                    word_vecs = tf.nn.embedding_lookup(self.word_embedding,
                                                       self.word_input)
                if self.use_bert_word:
                    bio_word_vecs = tf.nn.embedding_lookup(
                        self.bio_bert_word_embedding, self.bio_word_input)
                # postag_vec sentence_len x embedding_len
                if self.use_pts:
                    pt_vecs = tf.nn.embedding_lookup(self.postag_embedding,
                                                     self.pt_input)

            # char_embedding layer
            if self.use_char:
                char_cnn = TDNN(char_vecs,
                                feature_maps=self.feature_maps,
                                kernels=self.kernels,
                                embed_dim=self.c_emb_size)
                # if self.use_pts:
                # 	combined_emb = tf.concat([pt_vecs, char_cnn.output], 1)
                # else:
                combined_emb = char_cnn.output
            if self.use_word:
                if self.use_char:
                    combined_emb = tf.concat([word_vecs, combined_emb], 1)
                else:
                    combined_emb = word_vecs

            if self.use_bert_word:
                combined_emb = tf.concat([bio_word_vecs, combined_emb], 1)

            combined_emb = tf.reshape(combined_emb, [-1, self.total_emb_size])

            if self.highway:
                combined_emb = highway(combined_emb,
                                       self.total_emb_size,
                                       layer_size=1)

            combined_emb = tf.reshape(combined_emb, [-1, self.total_emb_size])
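            # add a leading batch dimension of 1; this graph processes one sentence at a time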
            combined_emb = tf.expand_dims(combined_emb, 0)
            combined_emb = tf.nn.dropout(combined_emb,
                                         keep_prob=1 - self.drop_rate)

            if not self.padding:
                # build a separate cell object per layer so stacked layers do not
                # end up sharing parameters
                lstm_cell_fw = tf.contrib.rnn.MultiRNNCell(
                    [tf.contrib.rnn.BasicLSTMCell(self.h_size)
                     for _ in range(self.lstm_layers)],
                    state_is_tuple=True)
                lstm_cell_bw = tf.contrib.rnn.MultiRNNCell(
                    [tf.contrib.rnn.BasicLSTMCell(self.h_size)
                     for _ in range(self.lstm_layers)],
                    state_is_tuple=True)
                self.outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=lstm_cell_fw,
                    cell_bw=lstm_cell_bw,
                    inputs=combined_emb,
                    dtype=tf.float32,
                    sequence_length=self.__length(combined_emb))

                out = tf.concat([self.outputs[0], self.outputs[1]], 2)

                # two layer NN
                w_1 = tf.get_variable("w_1", [self.h_size * 2, self.h_size])
                b_1 = tf.get_variable("b_1", [self.h_size])
                linear1 = tf.matmul(tf.reshape(out, [-1, self.h_size * 2]),
                                    w_1) + b_1
                w_3 = tf.get_variable("w_3", [self.h_size, self.num_classes])
                b_3 = tf.get_variable("b_3", [self.num_classes])
                self.logits = tf.matmul(tf.tanh(linear1), w_3) + b_3

            else:
                line_layer = 200
                gram_cnn = n_gram(combined_emb,
                                  embed_dim=self.total_emb_size,
                                  max_seq_len=self.max_seq_len)
                # gram_cnn = fcn(combined_emb, embed_dim = self.total_emb_size, max_seq_len = self.max_seq_len)
                cnn_output = gram_cnn.output

                if self.use_pts:
                    cnn_output = tf.concat([pt_vecs, cnn_output], 1)

                w_1 = tf.get_variable("w_1",
                                      [cnn_output.get_shape()[1], line_layer])
                b_1 = tf.get_variable("b_1", [line_layer])
                linear1 = tf.matmul(cnn_output, w_1) + b_1
                w_2 = tf.get_variable("w_2", [line_layer, self.num_classes])
                b_2 = tf.get_variable("b_2", [self.num_classes])
                self.logits = tf.matmul(tf.tanh(linear1), w_2) + b_2

            if not self.crf:
                self.loss = tf.reduce_mean(
                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits=self.logits, labels=self.target))
                soft_max = tf.nn.softmax(self.logits)
                self.y_pred = tf.argmax(soft_max, axis=1)
            else:
                # use crf to do post processing
                unary_scores = tf.reshape(self.logits,
                                          [1, -1, self.num_classes])
                if not self.padding:
                    log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(
                        unary_scores, tf.reshape(self.target, [1, -1]),
                        self.__length(combined_emb))
                else:
                    log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(
                        unary_scores, tf.reshape(self.target, [1, -1]),
                        self.s_len)

                self.loss = tf.reduce_mean(-log_likelihood)

            self.global_step = tf.Variable(0,
                                           name='global_step',
                                           trainable=False,
                                           collections=[
                                               tf.GraphKeys.GLOBAL_STEP,
                                               tf.GraphKeys.GLOBAL_VARIABLES
                                           ])

            self.learning_rate = tf.train.exponential_decay(
                0.002,  # Base learning rate.
                self.global_step,  # Current index into the dataset.
                20 * self.train_size,  # Decay step.
                0.95,  # Decay rate.
                staircase=True)

            self.opt = tf.train.MomentumOptimizer(self.learning_rate, 0.9)

            params = tf.trainable_variables()
            grads = []
            for grad in tf.gradients(self.loss, params):
                if grad is not None:
                    grads.append(tf.clip_by_norm(grad, self.max_grad_norm))
                else:
                    grads.append(grad)
            self.optim = self.opt.apply_gradients(zip(grads, params),
                                                  global_step=self.global_step)
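
The loop above clips each gradient tensor independently with tf.clip_by_norm. A common alternative (not what this example does) is global-norm clipping; a minimal sketch, with loss, params, opt, max_grad_norm and global_step standing in for the corresponding attributes above:

# hypothetical global-norm variant: rescale all gradients jointly so that
# their combined norm does not exceed max_grad_norm (None entries are ignored)
grads = tf.gradients(loss, params)
clipped, _ = tf.clip_by_global_norm(grads, max_grad_norm)
train_op = opt.apply_gradients(zip(clipped, params), global_step=global_step)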