Example #1

def build_model(self):
    # Representation generator: embed the tokens and encode them with the LSTM.
    self.inputs = tf.placeholder(tf.int32, [self.batch_size, self.seq_length])

    embed = tf.get_variable("embed", [self.vocab_size, self.embed_dim])
    word_embeds = tf.nn.embedding_lookup(embed, self.inputs)

    self.cell = rnn_cell.BasicLSTMCell(self.rnn_size)
    self.stacked_cell = rnn_cell.MultiRNNCell([self.cell] * self.layer_depth)

    # rnn.rnn takes a length-seq_length list of [batch_size x embed_dim]
    # tensors; squeeze only the time axis so a batch of size 1 survives.
    outputs, _ = rnn.rnn(self.stacked_cell,
                         [tf.squeeze(embed_t, [1]) for embed_t in tf.split(1, self.seq_length, word_embeds)],
                         dtype=tf.float32)

    # tf.pack stacks the outputs to [seq_length x batch_size x rnn_size];
    # mean-pool over time (axis 0), not over the batch (axis 1).
    output_embed = tf.pack(outputs)
    mean_pool = tf.nn.relu(tf.reduce_mean(output_embed, 0))

    self.num_action = 4
    self.object_size = 4

    # Action scorer: two linear heads with no bias, as in the paper.
    # Distinct scopes keep the two weight matrices separate (passing "action"
    # as the fourth positional argument set bias_start, not the scope, and
    # both heads then collided in the default "Linear" scope).
    self.pred_action = rnn_cell.linear(mean_pool, self.num_action, False, scope="action")
    self.object_ = rnn_cell.linear(mean_pool, self.object_size, False, scope="object")

    self.true_action = tf.placeholder(tf.int32, [self.batch_size, self.num_action])
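
The snippet stops at the label placeholder. As a minimal sketch of how the action head could be trained (the one-hot label format and cross-entropy objective are assumptions, not part of the source):

    # Hypothetical continuation; TF 0.x argument order is (logits, labels).
    action_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(self.pred_action,
                                                tf.to_float(self.true_action)))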
Example #2

def attention(input_t, output_t_minus_1, time):
	# attention_V, WxH, encoder_outputs and self.mask are closed over from
	# the enclosing model graph (see Example #15 for the full context).
	with tf.variable_scope('attention'):
		# batch_size x 1 x 1 x attention
		VxS = tf.reshape(rnn_cell.linear(output_t_minus_1, self.attention_judge_size, True),
			[-1, 1, 1, self.attention_judge_size])
	# batch_size x source_len x 1
	_exp = tf.exp(tf.reduce_sum(attention_V * tf.tanh(WxH + VxS), [3]))
	_exp = _exp * tf.expand_dims(self.mask, -1)  # zero out padded positions
	attention_weight = _exp / tf.reduce_sum(_exp, [1], keep_dims=True)  # masked softmax
	attention_t = tf.reduce_sum(encoder_outputs * attention_weight, [1])  # context vector
	feed_in_t = tf.tanh(rnn_cell.linear([attention_t, input_t], self.embedding_size, True))
	return feed_in_t
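
This is additive (Bahdanau-style) attention with explicit masking: WxH holds the projected encoder states, precomputed once with a 1x1 convolution; VxS projects the previous decoder output; and the exponentiated scores are zeroed at padded positions before renormalization, i.e. a softmax restricted to real source tokens. The resulting context vector is mixed with the current input through one more linear layer before being fed to the decoder.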
Example #4

 def __call__(self, inputs, state, scope=None):
   """Gated recurrent unit (GRU) with num_units cells."""
   with vs.variable_scope(scope or type(self).__name__):  # "GRUCell"
     with vs.variable_scope("Gates"):  # Reset gate and update gate.
       # We start with bias of 1.0 to not reset and not update.
       r, u = array_ops.split(1, 2, rnn_cell.linear([inputs, state],
                                           2 * self._num_units, True, 1.0))
       r, u = tf.sigmoid(r), tf.sigmoid(u)
     with vs.variable_scope("Candidate"):
       c = self._activation(rnn_cell.linear([inputs, r * state], self._num_units, True))
     new_h = u * state + (1 - u) * c
   return new_h, new_h
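
A minimal usage sketch under the TF 0.x list-based rnn API (the GRUCell class name, batch size 32, input size 50, and 10 steps are assumptions for illustration):

    cell = GRUCell(128)  # hypothetical cell class defining the __call__ above
    inputs = [tf.placeholder(tf.float32, [32, 50]) for _ in xrange(10)]
    outputs, final_state = rnn.rnn(cell, inputs, dtype=tf.float32)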
Example #5

    def __call__(self, inputs, state, con):
        with tf.variable_scope(type(self).__name__):
            # batch_size x (3 * cluster_size)
            concat = rnn_cell.linear(inputs, 3 * self._cluster_size, True)
            a, b, k = tf.split(1, 3, concat)
            ao = tf.exp(a)
            bo = tf.exp(b)
            ko = state + tf.exp(k)  # batch_size x _cluster_size

            phi = []
            for i in range(self._con_size):
                # each phi is [batch_size x 1]
                phi.append(
                    tf.reduce_sum(ao * tf.exp(-bo * tf.square(ko - i)),
                                  1,
                                  keep_dims=True))

            # tf.concat(1, phi)       -> [batch_size x seq_length]
            # tf.expand_dims(%, 1)    -> [batch_size x 1 x seq_length]
            # tf.batch_matmul(%, con) -> [batch_size x 1 x vocab_size]
            # tf.squeeze(%, [1])      -> [batch_size x vocab_size]
            wt = tf.squeeze(
                tf.batch_matmul(tf.expand_dims(tf.concat(1, phi), 1), con),
                [1])
        return wt, ko
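
This cell is the soft window from Graves, "Generating Sequences With Recurrent Neural Networks" (2013): the input is projected to per-cluster amplitudes ao, widths bo and locations ko (monotonically advancing, since exp(k) is added to the carried state), and the weights phi(i) = sum_k ao_k * exp(-bo_k * (ko_k - i)^2) softly select rows of the conditioning tensor con.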
Example #6

def highway(input_, size, layer_size=1, bias=-2, f=tf.nn.relu):
    """Highway Network (cf. http://arxiv.org/abs/1505.00387).

    t = sigmoid(Wy + b)
    z = t * g(Wy + b) + (1 - t) * y
    where g is nonlinearity, t is transform gate, and (1 - t) is carry gate.
    """
    output = input_
    for idx in xrange(layer_size):
        output = f(rnn_cell.linear(output, size, 0, scope='output_lin_%d' % idx))

        # Each layer needs its own gate; computing it outside the loop would
        # reuse a stale `idx` and gate only the last layer's output.
        transform_gate = tf.sigmoid(
            rnn_cell.linear(input_, size, 0, scope='transform_lin_%d' % idx) + bias)
        carry_gate = 1. - transform_gate

        output = transform_gate * output + carry_gate * input_

    return output
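
A quick usage sketch (the 100-dimensional shape is just for illustration):

    x = tf.placeholder(tf.float32, [None, 100])
    y = highway(x, 100, layer_size=2)  # same shape as x: gated mix of transform and carry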
Example #7

 def __call__(self, inputs, scope=None):
     """
     :param inputs: list of 2D Tensors with shape [batch_size x self.from_size]
      :return: a 2D Tensor with shape [batch_size x self.to_size]
          (linear concatenates the input list and projects it to one tensor)
     """
     with vs.variable_scope(scope or "Projector"):
         projected = linear(inputs, self.to_size, self.bias)
         if self.non_linearity is not None:
             projected = self.non_linearity(projected)
     return projected
Example #9

  def __call__(self, inputs, state, scope=None):
    """Long short-term memory cell (LSTM)."""
    with vs.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
      # Parameters of gates are concatenated into one multiply for efficiency.
      c, h = array_ops.split(1, 2, state)
      concat = rnn_cell.linear([inputs, h], 4 * self._num_units, True)

      # i = input_gate, j = new_input, f = forget_gate, o = output_gate
      i, j, f, o = array_ops.split(1, 4, concat)

      new_c = c * tf.sigmoid(f + self._forget_bias) + tf.sigmoid(i) * self._activation(j)
      new_h = self._activation(new_c) * tf.sigmoid(o)

      return new_h, array_ops.concat(1, [new_c, new_h])
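
Note the state layout: c and h travel packed in a single [batch_size x 2 * num_units] tensor, split on entry and re-concatenated on exit, so this cell's state is twice as wide as its output.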
Example #10

  def __call__(self, inputs, state, scope=None):
    """Long short-term memory cell (LSTM)."""
    with tf.variable_scope("BasicLSTMCell"):
      h = state
      # This variant keeps the cell state as a Python attribute rather than
      # threading it through `state`, lazily initializing it to zeros.
      if self.c is None:
        self.c = tf.reshape(tf.zeros_like(h), [-1, self._num_units])
      concat = linear([inputs, h, self.c], 4 * self._num_units, True)

      i, j, f, o = tf.split(1, 4, concat)

      self.c = self.c * tf.sigmoid(f + self._forget_bias) + tf.sigmoid(i) * tf.tanh(j)
      new_h = tf.tanh(self.c) * tf.sigmoid(o)

      # Extra learned projection of the hidden state before returning.
      softmax_w = tf.get_variable("softmax_w", [self._num_units, self._num_units])
      softmax_b = tf.get_variable("softmax_b", [self._num_units])

      new_y = tf.nn.xw_plus_b(new_h, softmax_w, softmax_b)

    return new_y, new_y
Example #13

    def prepare_model(self):
        with tf.variable_scope("LSTMTDNN"):
            self.char_inputs = []
            self.word_inputs = []
            self.cnn_outputs = []

            if self.use_char:
                char_W = tf.get_variable(
                    "char_embed", [self.char_vocab_size, self.char_embed_dim])
            else:
                word_W = tf.get_variable(
                    "word_embed", [self.word_vocab_size, self.word_embed_dim])

            with tf.variable_scope("CNN") as scope:
                self.char_inputs = tf.placeholder(
                    tf.int32,
                    [self.batch_size, self.seq_length, self.max_word_length])
                self.word_inputs = tf.placeholder(
                    tf.int32, [self.batch_size, self.seq_length])

                char_indices = tf.split(1, self.seq_length, self.char_inputs)
                word_indices = tf.split(1, self.seq_length,
                                        tf.expand_dims(self.word_inputs, -1))

                for idx in xrange(self.seq_length):
                    char_index = tf.reshape(char_indices[idx],
                                            [-1, self.max_word_length])
                    word_index = tf.reshape(word_indices[idx], [-1, 1])

                    if idx != 0:
                        scope.reuse_variables()

                    if self.use_char:
                        # [batch_size, max_word_length, char_embed_dim]
                        char_embed = tf.nn.embedding_lookup(char_W, char_index)

                        char_cnn = TDNN(char_embed, self.char_embed_dim,
                                        self.feature_maps, self.kernels)

                        if self.use_word:
                            word_embed = tf.nn.embedding_lookup(
                                word_W, word_index)
                            # tf.concat takes a list; word_embed is
                            # [batch_size x 1 x word_embed_dim], so drop
                            # the middle axis before concatenating.
                            cnn_output = tf.concat(
                                1, [char_cnn.output,
                                    tf.squeeze(word_embed, [1])])
                        else:
                            cnn_output = char_cnn.output
                    else:
                        cnn_output = tf.squeeze(
                            tf.nn.embedding_lookup(word_W, word_index))

                    if self.use_batch_norm:
                        bn = batch_norm()
                        norm_output = bn(
                            tf.expand_dims(tf.expand_dims(cnn_output, 1), 1))
                        cnn_output = tf.squeeze(norm_output)

                    # `if highway:` tested the function object and was always
                    # true; gate on a config flag instead (name assumed here).
                    if self.use_highway:
                        #cnn_output = highway(input_, input_dim_length, self.highway_layers, 0)
                        cnn_output = highway(cnn_output,
                                             cnn_output.get_shape()[1],
                                             self.highway_layers, 0)

                    self.cnn_outputs.append(cnn_output)

            with tf.variable_scope("LSTM") as scope:
                self.cell = rnn_cell.BasicLSTMCell(self.rnn_size)
                self.stacked_cell = rnn_cell.MultiRNNCell([self.cell] *
                                                          self.layer_depth)

                outputs, _ = rnn.rnn(self.stacked_cell,
                                     self.cnn_outputs,
                                     dtype=tf.float32)

                self.lstm_outputs = []
                self.true_outputs = tf.placeholder(
                    tf.float32,
                    [self.batch_size, self.seq_length, self.word_vocab_size])

                loss = 0
                true_outputs = tf.split(1, self.seq_length, self.true_outputs)

                for idx, (top_h,
                          true_output) in enumerate(zip(outputs,
                                                        true_outputs)):
                    if self.dropout_prob > 0:
                        top_h = tf.nn.dropout(top_h, self.dropout_prob)

                    if self.hsm > 0:
                        self.lstm_outputs.append(top_h)
                    else:
                        if idx != 0:
                            scope.reuse_variables()
                        proj = rnn_cell.linear(top_h, self.word_vocab_size, 0)
                        log_softmax = tf.log(tf.nn.softmax(proj))
                        self.lstm_outputs.append(log_softmax)

                    loss += tf.nn.softmax_cross_entropy_with_logits(
                        self.lstm_outputs[idx], tf.squeeze(true_output))

                self.loss = tf.reduce_mean(loss) / self.seq_length

                tf.scalar_summary("loss", self.loss)
                tf.scalar_summary("perplexity", tf.exp(self.loss))
Example #14

 def __call__(self, inputs, state, scope=None):
   """Most basic RNN: output = new_state = tanh(W * input + U * state + B)."""
   with vs.variable_scope(scope or type(self).__name__):  # "BasicRNNCell"
     output = self._activation(rnn_cell.linear([inputs, state], self._num_units, True))
   return output, output
Example #15

    def build_model(self):
        with tf.variable_scope('RNNTEST'):
            self.sense = tf.placeholder(tf.int32, [None])
            self.arg1 = tf.placeholder(tf.int32, [None, None, 4])
            self.arg2 = tf.placeholder(tf.int32, [None, None, 4])
            self.arg1_len = tf.placeholder(tf.int32, [None])
            self.arg2_len = tf.placeholder(tf.int32, [None])
            self.keep_prob = tf.placeholder(tf.float32)

            arg1_list = tf.split(2, 4, self.arg1)
            arg2_list = tf.split(2, 4, self.arg2)

            with tf.device('/cpu:0'):
                NER_W = tf.get_variable('NER_embed', [
                    self.data_loader.NER_vocab_size, self.NER_embed_size
                ]) if self.NER_embed_size > 0 else None
                lemma_W = tf.get_variable('lemma_embed', [
                    self.data_loader.lemma_vocab_size, self.lemma_embed_size
                ]) if self.lemma_embed_size > 0 else None
                if self.use_pre_trained_embedding:
                    word_W = tf.get_variable(
                        'word_embed',
                        initializer=tf.convert_to_tensor(
                            self.data_loader.pre_trained_word_embeddings,
                            dtype=tf.float32)
                    ) if self.word_embed_size > 0 else None
                else:
                    word_W = tf.get_variable(
                        'word_embed',
                        shape=[
                            self.data_loader.word_vocab_size,
                            self.word_embed_size
                        ]) if self.word_embed_size > 0 else None
                POS_W = tf.get_variable('POS_embed', [
                    self.data_loader.POS_vocab_size, self.POS_embed_size
                ]) if self.POS_embed_size > 0 else None
            arg1_embed_list = []
            arg2_embed_list = []
            for idx, W in enumerate([NER_W, lemma_W, word_W, POS_W]):
                if W is not None:
                    arg1_embed_list.append(
                        tf.nn.embedding_lookup(W,
                                               tf.squeeze(arg1_list[idx],
                                                          [2])))
                    arg2_embed_list.append(
                        tf.nn.embedding_lookup(W,
                                               tf.squeeze(arg2_list[idx],
                                                          [2])))
            arg1 = tf.nn.dropout(tf.concat(2, arg1_embed_list), self.keep_prob)
            arg2 = tf.nn.dropout(tf.concat(2, arg2_embed_list), self.keep_prob)

            encoder_lstm_unit = rnn_cell.BasicLSTMCell(self.encoder_size)
            decoder_lstm_unit = rnn_cell.BasicLSTMCell(self.decoder_size)

            with tf.variable_scope('forward_encoder'):
                forward_encoder_outputs, forward_encoder_state = rnn.dynamic_rnn(
                    encoder_lstm_unit, arg1, self.arg1_len, dtype=tf.float32)
            with tf.variable_scope('backward_encoder'):
                backward_encoder_outputs, backward_encoder_state = rnn.dynamic_rnn(
                    encoder_lstm_unit,
                    tf.reverse_sequence(arg1, tf.cast(self.arg1_len, tf.int64),
                                        1),
                    dtype=tf.float32)
            encoder_outputs = tf.concat(2, [
                forward_encoder_outputs,
                tf.reverse_sequence(backward_encoder_outputs,
                                    tf.cast(self.arg1_len, tf.int64), 1)
            ])
            encoder_state = tf.concat(
                1, [forward_encoder_state, backward_encoder_state])

            source = tf.expand_dims(
                encoder_outputs,
                2)  #batch_size x source_len x 1 x source_depth(2*encoder_size)
            attention_W = tf.get_variable(
                'attention_W',
                [1, 1, 2 * self.encoder_size, self.attention_judge_size])
            attention_V = tf.get_variable('attention_V',
                                          [self.attention_judge_size])
            WxH = tf.nn.conv2d(
                source, attention_W, [1, 1, 1, 1],
                'SAME')  #batch_size x source_len x 1 x attention
            self.mask = tf.placeholder(tf.float32, [None, None])

            def attention(input_t, output_t_minus_1, time):
                with tf.variable_scope('attention'):
                    VxS = tf.reshape(
                        rnn_cell.linear(output_t_minus_1,
                                        self.attention_judge_size, True),
                        [-1, 1, 1, self.attention_judge_size
                         ])  #batch_size x 1 x 1 x attention
                _exp = tf.exp(
                    tf.reduce_sum(attention_V * tf.tanh(WxH + VxS),
                                  [3]))  #batch_size x source_len x 1
                _exp = _exp * tf.expand_dims(self.mask, -1)
                attention_weight = _exp / tf.reduce_sum(_exp, [1],
                                                        keep_dims=True)
                attention_t = tf.reduce_sum(encoder_outputs * attention_weight,
                                            [1])
                feed_in_t = tf.tanh(
                    rnn_cell.linear([attention_t, input_t],
                                    self.embedding_size, True))
                return feed_in_t

            with tf.variable_scope('decoder'):
                decoder_outputs, decoder_state = dynamic_rnn_decoder(
                    arg2,
                    decoder_lstm_unit,
                    initial_state=encoder_state,
                    sequence_length=self.arg2_len,
                    loop_function=attention)
            judge = tf.concat(1, [
                tf.reduce_sum(decoder_outputs, [1]) /
                tf.expand_dims(tf.cast(self.arg2_len, tf.float32), -1),
                tf.reduce_sum(encoder_outputs, [1]) /
                tf.expand_dims(tf.cast(self.arg1_len, tf.float32), -1)
            ])
            unscaled_log_distribution = rnn_cell.linear(
                judge, self.data_loader.sense_vocab_size, True)
            self.output = tf.cast(tf.argmax(unscaled_log_distribution, 1),
                                  tf.int32)
            self.accuracy = tf.reduce_mean(
                tf.cast(tf.equal(self.output, self.sense), tf.float32))

            #max-margin method
            #self._MM = tf.placeholder(tf.int32,[None])
            #margin = tf.sub(tf.reduce_max(unscaled_log_distribution,[1]),tf.gather(tf.reshape(unscaled_log_distribution,[-1]),self._MM))
            #self.loss = tf.reduce_mean(margin)

            #maximum likelihood method
            self.loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    unscaled_log_distribution, self.sense))

            self.optimizer = tf.train.AdagradOptimizer(self.lr)
            self.train_op = self.optimizer.minimize(self.loss)
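
Taken together: arg1 is encoded by a bidirectional LSTM (one cell run forward and again over a length-aware reversed copy), the decoder consumes arg2 while attending over the encoder states through the attention closure above, and the judge vector, built from length-normalized mean pools of the decoder and encoder outputs, feeds a final linear layer over the sense vocabulary trained by maximum likelihood with Adagrad.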
Example #17

 def __call__(self, inputs, state, scope=None):
   """Most basic RNN: output = new_state = relu(W * input + U * state + B)."""
   with vs.variable_scope(scope or type(self).__name__):  # "BasicRNNCell"
     output = tf.nn.relu(rnn_cell.linear([inputs, state], self._num_units, True))
   return output, output