def inference_slot(self):  # slot-filling decoder
     """main computation graph here: 1.prepare decode parameters; 2.decode with attention; 3.dropout; 4.get logits."""
     # 1. prepare decode parameters
     # decoder_slot_inputs: embedding lookup, then split into a per-timestep list
     decoder_slot_inputs = tf.nn.embedding_lookup(
         self.Embedding_slot_label, self.decoder_slot_input
     )  #[batch_size,self.decoder_sent_length,embed_size]
     decoder_slot_inputs = tf.split(
         decoder_slot_inputs, self.decoder_sent_length, axis=1
     )  # a list of length decoder_sent_length; each element is [batch_size,1,embed_size]
     decoder_slot_inputs = [
         tf.squeeze(x, axis=1) for x in decoder_slot_inputs
     ]  # a list of length decoder_sent_length; each element is [batch_size,embed_size]
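     # The legacy list-based decoder API (unlike dynamic_rnn) expects a Python
     # list of per-timestep tensors, hence the split + squeeze above:
     # [batch_size, decoder_sent_length, embed_size] -> decoder_sent_length tensors of [batch_size, embed_size].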
     cell = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_size,
                                         state_is_tuple=False)
     output_projection = (self.W_projection_slot, self.b_projection_slot)
     loop_function = extract_argmax_and_embed(
         self.Embedding_slot_label,
         output_projection) if not self.is_training else None
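     # At test time (is_training=False) the loop function feeds the embedding of
     # the previous argmax prediction back in as the next decoder input. A minimal
     # sketch of what extract_argmax_and_embed is assumed to return (hypothetical;
     # the real helper is defined elsewhere in this repo):
     #
     #   def loop_function(prev, _):
     #       prev = tf.matmul(prev, output_projection[0]) + output_projection[1]
     #       prev_symbol = tf.argmax(prev, axis=1)
     #       return tf.nn.embedding_lookup(embedding, prev_symbol)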
     initial_state = tf.concat(
         [self.bi_state[0][0], self.bi_state[1][0]], -1
     )  #shape:[batch_size,hidden_size*2]. final encoder states as the decoder initial state. bi_state[0]/bi_state[1] are the forward/backward LSTMStateTuple(c, h), so [0][0]/[1][0] select the cell states c.
     #tf.layers.dense(self.input_knowledges, self.hidden_size, activation=tf.nn.tanh, use_bias=False) #[None, self.decoder_sent_length,hidden_size]
     # 2. decode with attention
     outputs, final_state = rnn_decoder_with_attention(
         decoder_slot_inputs,
         initial_state,
         cell,
         loop_function,
         self.encode_outputs,
         self.input_knowledges_embedding,
         self.hidden_size,
         scope=None
     )  # a list of length decoder_sent_length; each element is [batch_size x output_size]
     self.decoder_slot_computed = tf.stack(
         outputs, axis=1
     )  # decoder_output:[batch_size,decoder_sent_length,hidden_size]
     decoder_output = tf.reshape(
         self.decoder_slot_computed, shape=(-1, self.hidden_size)
     )  # decoder_output:[batch_size*decoder_sent_length,hidden_size]
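     # stack turns the per-step output list back into a single tensor, and the
     # reshape folds the time dimension into the batch dimension so one
     # projection matmul can score every timestep at once.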
     #decoder_output=self.layer_normalization(decoder_output) #layer normalization
     # 3. dropout as regularization
     with tf.name_scope("dropout"):
         decoder_output = tf.nn.dropout(
             decoder_output,
             keep_prob=self.dropout_keep_prob)  # shape:[-1,hidden_size]
     # 4. get logits
     with tf.name_scope("output"):
         logits = tf.matmul(
             decoder_output, self.W_projection_slot
         ) + self.b_projection_slot  # logits shape:[batch_size*decoder_sent_length,self.slots_num_classes]==tf.matmul([batch_size*decoder_sent_length,hidden_size],[hidden_size,self.slots_num_classes])
         logits = tf.reshape(
             logits,
             shape=(self.batch_size, self.decoder_sent_length,
                    self.slots_num_classes)
         )  #logits shape:[batch_size,decoder_sent_length,self.slots_num_classes]
     print("inference_slot.logits:", logits)
     return logits
    def inference(self):
        """main computation graph here:
        1.Word embedding. 2.Encoder with GRU. 3.Decoder using GRU (optionally with attention)."""
        ###################################################################################################################################
        # 1.embedding of words
        self.embedded_words = tf.nn.embedding_lookup(self.Embedding,self.input_x)  #[None, self.sequence_length, self.embed_size]
        # 2.encoder with GRU
        # 2.1 forward gru
        hidden_state_forward_list = self.gru_forward(self.embedded_words,self.gru_cell)  # a list of length sentence_length; each element is [batch_size,hidden_size]
        # 2.2 backward gru
        hidden_state_backward_list = self.gru_forward(self.embedded_words,self.gru_cell,reverse=True)  # a list of length sentence_length; each element is [batch_size,hidden_size]
        # 2.3 concat forward and backward hidden states. hidden_state: a list of length sentence_length; each element is [batch_size,hidden_size*2]
        thought_vector_list=[tf.concat([h_forward,h_backward],axis=1) for h_forward,h_backward in zip(hidden_state_forward_list,hidden_state_backward_list)] #list,len:sent_len,element:[batch_size,hidden_size*2]

        # 3.Decoder using GRU with attention
        thought_vector=tf.stack(thought_vector_list,axis=1) #shape:[batch_size,sentence_length,hidden_size*2]
        #initial_state=tf.reduce_sum(thought_vector,axis=1) #[batch_size,hidden_size*2] #TODO test which is better: summing up vs. using the last hidden state==>similarity.
        initial_state=tf.nn.tanh(tf.matmul(hidden_state_backward_list[0],self.W_initial_state)+self.b_initial_state) #initial_state:[batch_size,hidden_size*2]. this follows the paper's approach.
        cell=self.gru_cell_decoder #a special cell: besides the previous hidden state and the current input, it also takes a context vector that represents the attention result.
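        # Rough contract of the attention-augmented decoder cell (as described in
        # the comment above): at each step t it consumes the previous state
        # s_{t-1}, the current input y_{t-1}, and a context vector c_t obtained
        # by attending over attention_states with s_{t-1}.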

        output_projection=(self.W_projection,self.b_projection) #W_projection:[self.hidden_size * 2, self.num_classes]; b_projection:[self.num_classes]
        loop_function = extract_argmax_and_embed(self.Embedding_label,output_projection) if not self.is_training else None #the loop function is used only at test time, not during training.
        attention_states=thought_vector #[batch_size, self.sequence_length, hidden_size*2]
        decoder_input_embedded=tf.nn.embedding_lookup(self.Embedding_label,self.decoder_input) #[batch_size,self.decoder_sent_length,embed_size]
        decoder_input_splitted = tf.split(decoder_input_embedded, self.decoder_sent_length,axis=1)  # a list of length decoder_sent_length; each element is [batch_size,1,embed_size]
        decoder_input_squeezed = [tf.squeeze(x, axis=1) for x in decoder_input_splitted]  # a list of length decoder_sent_length; each element is [batch_size,embed_size]

        #rnn_decoder_with_attention(decoder_inputs, initial_state, cell, loop_function, attention_states, scope=None):
            #input1:decoder_inputs: the target shifted by one. e.g. if the target is "X Y Z", decoder_inputs is "START X Y Z". A list of 2D Tensors [batch_size x input_size].
            #input2:initial_state: 2D Tensor with shape [batch_size x cell.state_size].
            #input3:attention_states: represents X. 3D Tensor [batch_size x attn_length x attn_size].
            #output: (outputs, state): outputs is a list of length decoder_sent_length whose elements are [batch_size x output_size]; state is the final decoder state.
        outputs, final_state=rnn_decoder_with_attention(decoder_input_squeezed, initial_state, cell, loop_function, attention_states, scope=None) # a list of length decoder_sent_length; each element is [batch_size x output_size]
        decoder_output=tf.stack(outputs,axis=1) #decoder_output:[batch_size,decoder_sent_length,hidden_size*2]
        decoder_output=tf.reshape(decoder_output,shape=(-1,self.hidden_size*2)) #decoder_output:[batch_size*decoder_sent_length,hidden_size*2]

        with tf.name_scope("dropout"):
            decoder_output = tf.nn.dropout(decoder_output,keep_prob=self.dropout_keep_prob)  # shape:[batch_size*decoder_sent_length,hidden_size*2]
        # 4. get logits
        with tf.name_scope("output"):
            logits = tf.matmul(decoder_output, self.W_projection) + self.b_projection  # logits shape:[batch_size*decoder_sent_length,self.num_classes]==tf.matmul([batch_size*decoder_sent_length,hidden_size*2],[hidden_size*2,self.num_classes])
            logits=tf.reshape(logits,shape=(self.batch_size,self.decoder_sent_length,self.num_classes)) #logits shape:[batch_size,decoder_sent_length,self.num_classes]
        ###################################################################################################################################
        return logits
    def inference(self):
        """main computation graph here: 1.Word embedding. 2.Encoder with bi-directional LSTM. 3.Decoder using LSTM with attention."""
        # 1.word embedding
        embedded_words = tf.nn.embedding_lookup(
            self.Embedding,
            self.input_x)  # [None, self.sequence_length, self.embed_size]

        # 2.encode with bi-directional LSTM
        fw_cell = tf.nn.rnn_cell.BasicLSTMCell(
            self.hidden_size, state_is_tuple=True)  #rnn_cell.LSTMCell
        bw_cell = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_size,
                                               state_is_tuple=True)

        #fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell, output_keep_prob=self.dropout_keep_prob)
        #bw_cell = tf.contrib.rnn.DropoutWrapper(bw_cell, output_keep_prob=self.dropout_keep_prob)
        bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(
            fw_cell,
            bw_cell,
            embedded_words,
            dtype=tf.float32,
            #sequence_length: shape [batch_size], containing the actual length of each sequence in the batch
            sequence_length=self.sequence_length_batch,
            time_major=False,
            swap_memory=True)
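        # bidirectional_dynamic_rnn returns a pair of pairs: bi_outputs =
        # (fw_outputs, bw_outputs), each [batch_size, sequence_length, hidden_size],
        # and bi_state = (fw_state, bw_state), each an LSTMStateTuple(c, h)
        # because state_is_tuple=True above.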
        encode_outputs = tf.concat(
            [bi_outputs[0], bi_outputs[1]],
            -1)  #should be:[None, self.sequence_length,self.hidden_size*2]

        # 3. decode with attention
        # decoder_inputs: embedding lookup, then split into a per-timestep list
        decoder_inputs = tf.nn.embedding_lookup(
            self.Embedding_label, self.decoder_input
        )  #[batch_size,self.decoder_sent_length,embed_size]
        decoder_inputs = tf.split(
            decoder_inputs, self.decoder_sent_length, axis=1
        )  # a list of length decoder_sent_length; each element is [batch_size,1,embed_size]
        decoder_inputs = [
            tf.squeeze(x, axis=1) for x in decoder_inputs
        ]  # a list of length decoder_sent_length; each element is [batch_size,embed_size]
        cell = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_size,
                                            state_is_tuple=False)
        output_projection = (self.W_projection, self.b_projection)
        loop_function = extract_argmax_and_embed(
            self.Embedding_label,
            output_projection) if not self.is_training else None
        initial_state = tf.concat(
            [bi_state[0][0], bi_state[1][0]], -1
        )  #shape:[batch_size,hidden_size*2]. final encoder states as the decoder initial state. bi_state[0]/bi_state[1] are the forward/backward LSTMStateTuple(c, h), so [0][0]/[1][0] select the cell states c.
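        # Note: with state_is_tuple=False the decoder cell splits this
        # [batch_size, hidden_size*2] state vector back into (c, h) halves, so
        # the forward c state serves as the decoder's initial c and the
        # backward c state as its initial h.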
        outputs, final_state = rnn_decoder_with_attention(
            decoder_inputs,
            initial_state,
            cell,
            loop_function,
            encode_outputs,
            self.hidden_size,
            scope=None
        )  # a list of length decoder_sent_length; each element is [batch_size x output_size]
        decoder_output = tf.stack(
            outputs, axis=1
        )  # decoder_output:[batch_size,decoder_sent_length,hidden_size]
        decoder_output = tf.reshape(
            decoder_output, shape=(-1, self.hidden_size)
        )  # decoder_output:[batch_size*decoder_sent_length,hidden_size]

        #decoder_output=self.layer_normalization(decoder_output) #layer normalization

        with tf.name_scope("dropout"):  #dropout as regularization
            decoder_output = tf.nn.dropout(
                decoder_output,
                keep_prob=self.dropout_keep_prob)  # shape:[-1,hidden_size]
        # 4. get logits
        with tf.name_scope("output"):
            print("###decoder_output:", decoder_output
                  )  # <tf.Tensor 'dropout/dropout/mul:0' shape=(12, 1000)
            logits = tf.matmul(
                decoder_output, self.W_projection
            ) + self.b_projection  # logits shape:[batch_size*decoder_sent_length,self.num_classes]==tf.matmul([batch_size*decoder_sent_length,hidden_size],[hidden_size,self.num_classes])
            logits = tf.reshape(
                logits,
                shape=(self.batch_size, self.decoder_sent_length,
                       self.num_classes)
            )  #logits shape:[batch_size,decoder_sent_length,self.num_classes]
        return logits