Example #1
    def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
        ## Define hyperparameters
        self.embedding_dim = emb_dim
        self.dim = hidden_dim
        self.attention_size = 128
        self.mlp_size = self.dim
        self.sequence_length = seq_length
        self.lam = 0.01
        self.epsilon = 1e-10

        ## Define the placeholders
        self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
        self.hypothesis_x = tf.placeholder(tf.int32,
                                           [None, self.sequence_length])
        self.y = tf.placeholder(tf.int32, [None])
        self.keep_rate_ph = tf.placeholder(tf.float32, [])

        ## Define parameters
        self.E = tf.Variable(embeddings, trainable=emb_train)

        self.W_mlp = tf.Variable(
            tf.random_normal([self.dim * 4, self.mlp_size], stddev=0.1))
        self.b_mlp = tf.Variable(tf.random_normal([self.mlp_size], stddev=0.1))

        ## Function for embedding lookup and dropout at embedding layer
        def emb_drop(x):
            emb = tf.nn.embedding_lookup(self.E, x)
            emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
            return emb_drop

        # Get lengths of unpadded sentences
        prem_seq_lengths, prem_mask = blocks.length(self.premise_x)
        hyp_seq_lengths, hyp_mask = blocks.length(self.hypothesis_x)

        ### BiLSTM layer ###
        premise_in = emb_drop(self.premise_x)
        hypothesis_in = emb_drop(self.hypothesis_x)

        ############# MY CODE STARTS ########

        premise_outs, premise_final = blocks.biLSTM(premise_in,
                                                    dim=self.dim,
                                                    seq_len=prem_seq_lengths,
                                                    name='premise')
        attention_outs_pre, self.alphas_pre = blocks.attention(
            premise_outs,
            self.attention_size,
            return_alphas=True,
            mask=tf.squeeze(prem_mask))
        drop_pre = tf.nn.dropout(attention_outs_pre, self.keep_rate_ph)
        #drop_pre = attention_outs_pre

        hypothesis_outs, hypothesis_final = blocks.biLSTM(
            hypothesis_in,
            dim=self.dim,
            seq_len=hyp_seq_lengths,
            name='hypothesis')
        attention_outs_hyp, self.alphas_hyp = blocks.attention(
            hypothesis_outs,
            self.attention_size,
            return_alphas=True,
            mask=tf.squeeze(hyp_mask))
        drop_hyp = tf.nn.dropout(attention_outs_hyp, self.keep_rate_ph)
        #drop_hyp = attention_outs_hyp

        # Concatenate the premise and hypothesis attention outputs
        drop = tf.concat([drop_pre, drop_hyp], axis=1)
        h_mlp = tf.nn.relu(tf.matmul(drop, self.W_mlp) + self.b_mlp)

        ############# MY CODE ENDS ########

        ############# Hex Part #########
        ############  MY CODE STARTS #########

        attention_outs_pre_hex, self.alphas_pre_hex = blocks.attention(
            premise_outs,
            self.attention_size,
            return_alphas=True,
            mask=tf.squeeze(prem_mask))
        drop_pre_hex = tf.nn.dropout(attention_outs_pre_hex, self.keep_rate_ph)
        #drop_pre = attention_outs_pre

        attention_outs_hyp_hex, self.alphas_hyp_hex = blocks.attention(
            hypothesis_outs,
            self.attention_size,
            return_alphas=True,
            mask=tf.squeeze(hyp_mask))
        drop_hyp_hex = tf.nn.dropout(attention_outs_hyp_hex, self.keep_rate_ph)
        #drop_hyp = attention_outs_hyp

        # Concatenate the premise and hypothesis attention outputs
        bag_of_word_in = tf.concat([drop_pre_hex, drop_hyp_hex], axis=1)

        # Hex component inputs

        h_fc1 = h_mlp  # (?, 300)

        h_fc2 = bag_of_word_in  # (?, 1200)

        # Hex layer definition
        self.W_cl_1 = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
        self.W_cl_2 = tf.Variable(tf.random_normal([1200, 3]), trainable=True)
        self.b_cl = tf.Variable(tf.random_normal((3, )), trainable=True)
        self.W_cl = tf.concat([self.W_cl_1, self.W_cl_2], 0)
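        # W_cl stacks the weights for the two feature blocks: the first self.dim
        # rows act on h_fc1 (the MLP output) and the remaining 1200 rows act on
        # h_fc2 (bag_of_word_in), so a single matmul over their concatenation
        # evaluates both branches at once.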

        # Compute prediction using  [h_fc1, 0(pad)]
        pad = tf.zeros_like(h_fc2, tf.float32)
        # print(pad.shape) -> (?, 1200)

        yconv_contact_pred = tf.nn.dropout(tf.concat([h_fc1, pad], 1),
                                           self.keep_rate_ph)
        y_conv_pred = tf.matmul(yconv_contact_pred, self.W_cl) + self.b_cl

        self.logits = y_conv_pred  # Prediction

        # Compute loss using [h_fc1, h_fc2] and [0(pad2), h_fc2]
        pad2 = tf.zeros_like(h_fc1, tf.float32)

        yconv_contact_H = tf.nn.dropout(tf.concat([pad2, h_fc2], 1),
                                        self.keep_rate_ph)
        y_conv_H = tf.matmul(yconv_contact_H, self.W_cl) + self.b_cl  # get Fg

        yconv_contact_loss = tf.nn.dropout(tf.concat([h_fc1, h_fc2], 1),
                                           self.keep_rate_ph)
        y_conv_loss = tf.matmul(yconv_contact_loss,
                                self.W_cl) + self.b_cl  # get Fb

        self.temp = y_conv_H
        temp = tf.matmul(y_conv_H, y_conv_H, transpose_a=True)
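        # temp = Fg^T Fg; below, y_conv_loss (Fb) is projected onto the orthogonal
        # complement of the column space of y_conv_H (Fg), i.e.
        # Fb - Fg (Fg^T Fg)^{-1} Fg^T Fb, removing the part of the prediction
        # that h_fc2 alone can explain.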

        y_conv_loss = y_conv_loss - tf.matmul(
            tf.matmul(tf.matmul(y_conv_H, tf.matrix_inverse(temp)),
                      y_conv_H,
                      transpose_b=True), y_conv_loss)  # get loss

        cost_logits = y_conv_loss

        # Regularize hex attention
        alphas_pre_loss_hex = self.alphas_pre_hex + self.epsilon
        alphas_hyp_loss_hex = self.alphas_hyp_hex + self.epsilon
        reg1 = tf.reduce_mean(-tf.reduce_sum(
            alphas_pre_loss_hex * tf.log(alphas_pre_loss_hex), axis=1))
        reg2 = tf.reduce_mean(-tf.reduce_sum(
            alphas_hyp_loss_hex * tf.log(alphas_hyp_loss_hex), axis=1))
        reg = reg1 + reg2

        self.total_cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y,
                                                           logits=y_conv_loss))
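
For reference, a minimal NumPy sketch of the projection step in the HEX part above (names are illustrative; it assumes y_conv_H has full column rank, as the tf.matrix_inverse call requires):

import numpy as np

def project_out(F_g, F_b):
    # Remove from F_b the component lying in the column space of F_g:
    # F_b - F_g (F_g^T F_g)^{-1} F_g^T F_b, the same algebra as the graph above.
    P = F_g @ np.linalg.inv(F_g.T @ F_g) @ F_g.T
    return F_b - P @ F_b

rng = np.random.default_rng(0)
F_g = rng.normal(size=(8, 3))   # stand-in for y_conv_H
F_b = rng.normal(size=(8, 3))   # stand-in for y_conv_loss
residual = project_out(F_g, F_b)
print(np.allclose(F_g.T @ residual, 0.0))  # True: residual is orthogonal to F_g's columns
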
Example #2
    def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
        ## Define hyperparameters
        self.embedding_dim = emb_dim
        self.dim = hidden_dim
        self.attention_size = 128
        self.mlp_size = self.dim
        self.sequence_length = seq_length
        self.lam = 0.01
        self.epsilon = 1e-10

        ## Define the placeholders
        self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
        self.hypothesis_x = tf.placeholder(tf.int32,
                                           [None, self.sequence_length])
        self.y = tf.placeholder(tf.int32, [None])
        self.keep_rate_ph = tf.placeholder(tf.float32, [])

        ## Define parameters
        self.E = tf.Variable(embeddings, trainable=emb_train)

        self.W_mlp = tf.Variable(
            tf.random_normal([self.dim * 4, self.mlp_size], stddev=0.1))
        self.b_mlp = tf.Variable(tf.random_normal([self.mlp_size], stddev=0.1))

        self.W_cl = tf.Variable(
            tf.random_normal([self.mlp_size, 3], stddev=0.1))
        self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

        ## Function for embedding lookup and dropout at embedding layer
        def emb_drop(x):
            emb = tf.nn.embedding_lookup(self.E, x)
            emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
            return emb_drop

        # Get lengths of unpadded sentences
        prem_seq_lengths, prem_mask = blocks.length(self.premise_x)
        hyp_seq_lengths, hyp_mask = blocks.length(self.hypothesis_x)

        ### BiLSTM layer ###
        premise_in = emb_drop(self.premise_x)
        hypothesis_in = emb_drop(self.hypothesis_x)

        ############# MY CODE STARTS ########

        premise_outs, premise_final = blocks.biLSTM(premise_in,
                                                    dim=self.dim,
                                                    seq_len=prem_seq_lengths,
                                                    name='premise')
        attention_outs_pre, self.alphas_pre = blocks.attention(
            premise_outs, self.attention_size, return_alphas=True)
        drop_pre = tf.nn.dropout(attention_outs_pre, self.keep_rate_ph)
        #drop_pre = attention_outs_pre

        hypothesis_outs, hypothesis_final = blocks.biLSTM(
            hypothesis_in,
            dim=self.dim,
            seq_len=hyp_seq_lengths,
            name='hypothesis')
        attention_outs_hyp, self.alphas_hyp = blocks.attention(
            hypothesis_outs, self.attention_size, return_alphas=True)
        drop_hyp = tf.nn.dropout(attention_outs_hyp, self.keep_rate_ph)
        #drop_hyp = attention_outs_hyp

        # Concatenate the premise and hypothesis attention outputs
        drop = tf.concat([drop_pre, drop_hyp], axis=1)

        # Add a small constant
        alphas_pre_loss = self.alphas_pre * tf.squeeze(
            prem_mask) + self.epsilon
        alphas_hyp_loss = self.alphas_hyp * tf.squeeze(hyp_mask) + self.epsilon

        # Calculate entropy
        reg1 = tf.reduce_mean(
            -tf.reduce_sum(alphas_pre_loss * tf.log(alphas_pre_loss), axis=1))
        reg2 = tf.reduce_mean(
            -tf.reduce_sum(alphas_hyp_loss * tf.log(alphas_hyp_loss), axis=1))
        reg = reg1 + reg2
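        # reg is the mean entropy of the two attention distributions; it is added
        # to the cross-entropy below (scaled by self.lam), penalizing diffuse attention.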

        # MLP layer
        h_mlp = tf.nn.relu(tf.matmul(drop, self.W_mlp) + self.b_mlp)

        ############# MY CODE ENDS ########

        # Get prediction
        self.logits = tf.matmul(h_mlp, self.W_cl) + self.b_cl

        # Define the cost function
        self.total_cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.y, logits=self.logits) + self.lam * reg)
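
A standalone sketch of that entropy term on made-up attention weights (illustrative values only):

import numpy as np

alphas = np.array([[0.7, 0.2, 0.1],        # peaked attention -> low entropy
                   [1/3, 1/3, 1/3]])       # uniform attention -> max entropy (log 3)
eps = 1e-10
entropy = -np.sum((alphas + eps) * np.log(alphas + eps), axis=1)
print(entropy)  # ~[0.80, 1.10]; adding the mean to the loss discourages diffuse attention
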
Example #3
    def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
        ## Define hyperparameters
        lambd = 0.05

        ## note: embedding_dim and hidden_dim are both 300, used interchangeably
        self.embedding_dim = emb_dim
        self.dim = hidden_dim
        self.sequence_length = seq_length

        ## Define the placeholders
        self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
        self.hypothesis_x = tf.placeholder(tf.int32,
                                           [None, self.sequence_length])
        self.y = tf.placeholder(tf.int32, [None])
        self.keep_rate_ph = tf.placeholder(tf.float32, [])

        ## Define parameters
        self.E = tf.Variable(embeddings, trainable=emb_train)

        self.W_mlp = tf.Variable(
            tf.random_normal([self.dim * 8, self.dim], stddev=0.1))
        self.b_mlp = tf.Variable(tf.random_normal([self.dim], stddev=0.1))

        self.W_cl = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
        self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

        ## Function for embedding lookup and dropout at embedding layer
        def emb_drop(x):
            emb = tf.nn.embedding_lookup(self.E, x)
            emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
            return emb_drop

        # Get lengths of unpadded sentences
        prem_seq_lengths, mask_prem = blocks.length(self.premise_x)
        hyp_seq_lengths, mask_hyp = blocks.length(self.hypothesis_x)

        ### First biLSTM layer ###

        premise_in = emb_drop(self.premise_x)
        hypothesis_in = emb_drop(self.hypothesis_x)

        premise_outs, c1 = blocks.biLSTM(premise_in,
                                         dim=self.dim,
                                         seq_len=prem_seq_lengths,
                                         name='premise')
        hypothesis_outs, c2 = blocks.biLSTM(hypothesis_in,
                                            dim=self.dim,
                                            seq_len=hyp_seq_lengths,
                                            name='hypothesis')

        premise_bi = tf.concat(premise_outs, axis=2)
        hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

        premise_list = tf.unstack(premise_bi, axis=1)
        hypothesis_list = tf.unstack(hypothesis_bi, axis=1)

        ### Attention ###

        scores_all = []
        premise_attn = []
        alphas = []

        for i in range(self.sequence_length):

            scores_i_list = []
            for j in range(self.sequence_length):
                score_ij = tf.reduce_sum(tf.multiply(premise_list[i],
                                                     hypothesis_list[j]),
                                         1,
                                         keep_dims=True)
                scores_i_list.append(score_ij)

            scores_i = tf.stack(scores_i_list, axis=1)
            alpha_i = blocks.masked_softmax(scores_i, mask_hyp)
            a_tilde_i = tf.reduce_sum(tf.multiply(alpha_i, hypothesis_bi), 1)
            premise_attn.append(a_tilde_i)

            scores_all.append(scores_i)
            alphas.append(alpha_i)

        scores_stack = tf.stack(scores_all, axis=2)
        scores_list = tf.unstack(scores_stack, axis=1)

        hypothesis_attn = []
        betas = []
        for j in range(self.sequence_length):
            scores_j = scores_list[j]
            beta_j = blocks.masked_softmax(scores_j, mask_prem)
            b_tilde_j = tf.reduce_sum(tf.multiply(beta_j, premise_bi), 1)
            hypothesis_attn.append(b_tilde_j)

            betas.append(beta_j)

        # Make attention-weighted sentence representations into one tensor,
        premise_attns = tf.stack(premise_attn, axis=1)
        hypothesis_attns = tf.stack(hypothesis_attn, axis=1)

        # For making attention plots,
        self.alpha_s = tf.stack(alphas, axis=2)
        self.beta_s = tf.stack(betas, axis=2)

        ### Subcomponent Inference ###

        prem_diff = tf.subtract(premise_bi, premise_attns)
        prem_mul = tf.multiply(premise_bi, premise_attns)
        hyp_diff = tf.subtract(hypothesis_bi, hypothesis_attns)
        hyp_mul = tf.multiply(hypothesis_bi, hypothesis_attns)

        m_a = tf.concat([premise_bi, premise_attns, prem_diff, prem_mul], 2)
        m_b = tf.concat([hypothesis_bi, hypothesis_attns, hyp_diff, hyp_mul],
                        2)

        ### Inference Composition ###

        v1_outs, c3 = blocks.biLSTM(m_a,
                                    dim=self.dim,
                                    seq_len=prem_seq_lengths,
                                    name='v1')
        v2_outs, c4 = blocks.biLSTM(m_b,
                                    dim=self.dim,
                                    seq_len=hyp_seq_lengths,
                                    name='v2')

        v1_bi = tf.concat(v1_outs, axis=2)
        v2_bi = tf.concat(v2_outs, axis=2)

        ### Pooling Layer ###

        v_1_sum = tf.reduce_sum(v1_bi, 1)
        v_1_ave = tf.div(
            v_1_sum, tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1))

        v_2_sum = tf.reduce_sum(v2_bi, 1)
        v_2_ave = tf.div(
            v_2_sum, tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1))

        v_1_max = tf.reduce_max(v1_bi, 1)
        v_2_max = tf.reduce_max(v2_bi, 1)

        v = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max], 1)

        # MLP layer
        h_mlp = tf.nn.tanh(tf.matmul(v, self.W_mlp) + self.b_mlp)

        ############### MY CODE STARTS #####

        # Define layer size
        self.bow_layer_size = 600

        # LSTM layer (final layer of the original esim model)
        h_fc1 = h_mlp

        # Bag-of-word input (averaging word embeddings)
        bow_pre = premise_in
        bow_hyp = hypothesis_in
        # print(bow_pre.shape) -> (?, 50, 300)
        bag_of_word_pre = tf.reduce_mean(bow_pre, 1)
        bag_of_word_hyp = tf.reduce_mean(bow_hyp, 1)
        # print(bag_of_word_pre.shape) -> (?, 300)
        bag_of_word_in = tf.concat([bag_of_word_pre, bag_of_word_hyp], 1)
        # print(bag_of_word_in.shape) -> (?, 600)

        # Bag-of-word input layer params
        h_fc2 = bag_of_word_in
        # print( h_fc2.shape) -> (?, 600)

        # Bag-of-word output layer params
        weights_from_split = np.load(
            "../../rearrangingDS/rearranged_even_seqlen50/weights.npy")
        # (600, 3)
        bias_from_split = np.load(
            "../../rearrangingDS/rearranged_even_seqlen50/bias.npy")
        # (3,)

        self.W_cl_1 = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
        self.W_cl_2 = tf.Variable(tf.random_normal([600, 3]), trainable=True)
        self.b_cl = tf.Variable(tf.random_normal((3, )), trainable=True)
        self.W_cl = tf.concat([self.W_cl_1, self.W_cl_2], 0)
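        # W_cl again spans both feature blocks (h_fc1 and h_fc2); reg below puts
        # an L1 penalty on the bag-of-words weights W_cl_2 only, scaled by lambd.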

        reg = lambd * tf.reduce_sum(tf.abs(self.W_cl_2)) / (2 * 50)

        # Compute prediction using  [h_fc1, 0(pad)]
        pad = tf.zeros_like(h_fc2, tf.float32)
        # print(pad.shape) -> (?, 600)

        yconv_contact_pred = tf.nn.dropout(tf.concat([h_fc1, pad], 1),
                                           self.keep_rate_ph)
        y_conv_pred = tf.matmul(yconv_contact_pred, self.W_cl) + self.b_cl

        self.logits = y_conv_pred  # Prediction

        # Compute loss using [h_fc1, h_fc2] and [0(pad2), h_fc2]
        pad2 = tf.zeros_like(h_fc1, tf.float32)

        yconv_contact_H = tf.nn.dropout(tf.concat([pad2, h_fc2], 1),
                                        self.keep_rate_ph)
        y_conv_H = tf.matmul(yconv_contact_H, self.W_cl) + self.b_cl  # get Fg

        yconv_contact_loss = tf.nn.dropout(tf.concat([h_fc1, h_fc2], 1),
                                           self.keep_rate_ph)
        y_conv_loss = tf.matmul(yconv_contact_loss,
                                self.W_cl) + self.b_cl  # get Fb
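        # Project y_conv_loss (Fb) onto the orthogonal complement of the column
        # space of y_conv_H (Fg): subtract Fg (Fg^T Fg)^{-1} Fg^T Fb.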

        y_conv_loss = y_conv_loss - tf.matmul(
            tf.matmul(tf.matmul(
                y_conv_H,
                tf.matrix_inverse(
                    tf.matmul(y_conv_H, y_conv_H, transpose_a=True))),
                      y_conv_H,
                      transpose_b=True), y_conv_loss)  # get loss

        cost_logits = y_conv_loss
        self.total_cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.y, logits=y_conv_loss)) + reg  # Cost
Example #4
    def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
        ## Define hyperparameters
        self.embedding_dim = emb_dim
        self.dim = hidden_dim
        self.sequence_length = seq_length

        ## Define the placeholders
        self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
        self.hypothesis_x = tf.placeholder(tf.int32,
                                           [None, self.sequence_length])
        self.y = tf.placeholder(tf.int32, [None])
        self.keep_rate_ph = tf.placeholder(tf.float32, [])

        ## Define parameters
        self.E = tf.Variable(embeddings, trainable=emb_train)

        self.W_mlp = tf.Variable(
            tf.random_normal([self.dim * 8, self.dim], stddev=0.1))
        self.b_mlp = tf.Variable(tf.random_normal([self.dim], stddev=0.1))

        self.W_cl = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
        self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

        ## Function for embedding lookup and dropout at embedding layer
        def emb_drop(x):
            emb = tf.nn.embedding_lookup(self.E, x)
            emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
            return emb_drop

        # Get lengths of unpadded sentences
        prem_seq_lengths, mask_prem = blocks.length(self.premise_x)
        hyp_seq_lengths, mask_hyp = blocks.length(self.hypothesis_x)

        ### First biLSTM layer ###

        premise_in = emb_drop(self.premise_x)
        hypothesis_in = emb_drop(self.hypothesis_x)

        premise_outs, c1 = blocks.biLSTM(premise_in,
                                         dim=self.dim,
                                         seq_len=prem_seq_lengths,
                                         name='premise')
        hypothesis_outs, c2 = blocks.biLSTM(hypothesis_in,
                                            dim=self.dim,
                                            seq_len=hyp_seq_lengths,
                                            name='hypothesis')

        premise_bi = tf.concat(premise_outs, axis=2)
        hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

        premise_list = tf.unstack(premise_bi, axis=1)
        hypothesis_list = tf.unstack(hypothesis_bi, axis=1)

        ### Attention ###

        scores_all = []
        premise_attn = []
        alphas = []

        for i in range(self.sequence_length):

            scores_i_list = []
            for j in range(self.sequence_length):
                score_ij = tf.reduce_sum(tf.multiply(premise_list[i],
                                                     hypothesis_list[j]),
                                         1,
                                         keep_dims=True)
                scores_i_list.append(score_ij)

            scores_i = tf.stack(scores_i_list, axis=1)
            alpha_i = blocks.masked_softmax(scores_i, mask_hyp)
            a_tilde_i = tf.reduce_sum(tf.multiply(alpha_i, hypothesis_bi), 1)
            premise_attn.append(a_tilde_i)

            scores_all.append(scores_i)
            alphas.append(alpha_i)

        scores_stack = tf.stack(scores_all, axis=2)
        scores_list = tf.unstack(scores_stack, axis=1)

        hypothesis_attn = []
        betas = []
        for j in range(self.sequence_length):
            scores_j = scores_list[j]
            beta_j = blocks.masked_softmax(scores_j, mask_prem)
            b_tilde_j = tf.reduce_sum(tf.multiply(beta_j, premise_bi), 1)
            hypothesis_attn.append(b_tilde_j)

            betas.append(beta_j)

        # Make attention-weighted sentence representations into one tensor,
        premise_attns = tf.stack(premise_attn, axis=1)
        hypothesis_attns = tf.stack(hypothesis_attn, axis=1)

        # For making attention plots,
        self.alpha_s = tf.stack(alphas, axis=2)
        self.beta_s = tf.stack(betas, axis=2)

        ### Subcomponent Inference ###

        prem_diff = tf.subtract(premise_bi, premise_attns)
        prem_mul = tf.multiply(premise_bi, premise_attns)
        hyp_diff = tf.subtract(hypothesis_bi, hypothesis_attns)
        hyp_mul = tf.multiply(hypothesis_bi, hypothesis_attns)

        m_a = tf.concat([premise_bi, premise_attns, prem_diff, prem_mul], 2)
        m_b = tf.concat([hypothesis_bi, hypothesis_attns, hyp_diff, hyp_mul],
                        2)

        ### Inference Composition ###

        v1_outs, c3 = blocks.biLSTM(m_a,
                                    dim=self.dim,
                                    seq_len=prem_seq_lengths,
                                    name='v1')
        v2_outs, c4 = blocks.biLSTM(m_b,
                                    dim=self.dim,
                                    seq_len=hyp_seq_lengths,
                                    name='v2')

        v1_bi = tf.concat(v1_outs, axis=2)
        v2_bi = tf.concat(v2_outs, axis=2)

        ### Pooling Layer ###

        v_1_sum = tf.reduce_sum(v1_bi, 1)
        v_1_ave = tf.div(
            v_1_sum, tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1))

        v_2_sum = tf.reduce_sum(v2_bi, 1)
        v_2_ave = tf.div(
            v_2_sum, tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1))

        v_1_max = tf.reduce_max(v1_bi, 1)
        v_2_max = tf.reduce_max(v2_bi, 1)

        v = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max], 1)
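        # v concatenates average- and max-pooled composition vectors from both
        # sentences (4 blocks of 2*dim = 8*dim), matching W_mlp's input size.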

        # MLP layer
        h_mlp = tf.nn.tanh(tf.matmul(v, self.W_mlp) + self.b_mlp)

        # Dropout applied to classifier
        h_drop = tf.nn.dropout(h_mlp, self.keep_rate_ph)

        # Get prediction
        self.logits = tf.matmul(h_drop, self.W_cl) + self.b_cl

        # Define the cost function
        self.total_cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y,
                                                           logits=self.logits))
Example #5
    def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
        ## Define hyperparameters
        self.embedding_dim = emb_dim
        self.dim = hidden_dim
        self.sequence_length = seq_length

        ## Define the placeholders
        self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
        self.hypothesis_x = tf.placeholder(tf.int32,
                                           [None, self.sequence_length])
        self.premise_pos = tf.placeholder(tf.int32,
                                          [None, self.sequence_length, 47],
                                          name='premise_pos')
        self.hypothesis_pos = tf.placeholder(tf.int32,
                                             [None, self.sequence_length, 47],
                                             name='hypothesis_pos')
        self.y = tf.placeholder(tf.int32, [None])
        self.keep_rate_ph = tf.placeholder(tf.float32, [])

        ## Define parameters
        self.E = tf.Variable(embeddings, trainable=emb_train)

        self.W_mlp = tf.Variable(
            tf.random_normal([self.dim * 12, self.dim], stddev=0.1))
        self.b_mlp = tf.Variable(tf.random_normal([self.dim], stddev=0.1))

        self.W_cl = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
        self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

        # ## Define External Knowledge dictionary para.
        # self.exterKnowledge_dic = exterKnowledge_dic
        ## Define R_matrix
        self.R_mat = tf.placeholder(
            tf.float32, [None, self.sequence_length, self.sequence_length])
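        # R_mat[:, i, j] holds an external-knowledge relation score between premise
        # token i and hypothesis token j; it is added to the attention scores below.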

        ## Function for embedding lookup and dropout at embedding layer
        def emb_drop(x):
            emb = tf.nn.embedding_lookup(self.E, x)
            emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
            return emb_drop

        # Get lengths of unpadded sentences
        prem_seq_lengths, mask_prem = blocks.length(self.premise_x)
        hyp_seq_lengths, mask_hyp = blocks.length(self.hypothesis_x)

        ### First biLSTM layer ###

        premise_in = tf.concat(
            [emb_drop(self.premise_x),
             tf.cast(self.premise_pos, tf.float32)],
            axis=2)
        hypothesis_in = tf.concat([
            emb_drop(self.hypothesis_x),
            tf.cast(self.hypothesis_pos, tf.float32)
        ],
                                  axis=2)

        premise_outs, c1 = blocks.biLSTM(premise_in,
                                         dim=self.dim,
                                         seq_len=prem_seq_lengths,
                                         name='premise')
        hypothesis_outs, c2 = blocks.biLSTM(hypothesis_in,
                                            dim=self.dim,
                                            seq_len=hyp_seq_lengths,
                                            name='hypothesis')

        premise_bi = tf.concat(premise_outs, axis=2)
        hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

        premise_list = tf.unstack(premise_bi, axis=1)
        hypothesis_list = tf.unstack(hypothesis_bi, axis=1)

        ### self-attention ###
        premise_project = blocks.dense(premise_bi, 600)
        premise_project_list = tf.unstack(premise_project, axis=1)
        premise_self_attn = []
        alphas = []

        for i in range(self.sequence_length):
            scores_i_list = []
            for j in range(self.sequence_length):
                score_ij = tf.reduce_sum(tf.multiply(premise_project_list[i],
                                                     premise_project_list[j]),
                                         1,
                                         keep_dims=True)
                scores_i_list.append(score_ij)
            scores_i = tf.stack(scores_i_list, axis=1)
            alpha_i = blocks.masked_softmax(scores_i, mask_prem)
            p_tilde_i = tf.reduce_sum(tf.multiply(alpha_i, premise_bi), 1)
            premise_self_attn.append(p_tilde_i)

        hypothesis_project = blocks.dense(hypothesis_bi, 600)
        hypothesis_project_list = tf.unstack(hypothesis_project, axis=1)
        hypothesis_self_attn = []
        for i in range(self.sequence_length):
            scores_i_list = []
            for j in range(self.sequence_length):
                score_ij = tf.reduce_sum(tf.multiply(
                    hypothesis_project_list[i], hypothesis_project_list[j]),
                                         1,
                                         keep_dims=True)
                scores_i_list.append(score_ij)
            scores_i = tf.stack(scores_i_list, axis=1)
            beta_i = blocks.masked_softmax(scores_i, mask_hyp)
            h_tilde_i = tf.reduce_sum(tf.multiply(beta_i, hypothesis_bi), 1)
            hypothesis_self_attn.append(h_tilde_i)

        premise_self_attns = tf.stack(premise_self_attn, axis=1)
        hypothesis_self_attns = tf.stack(hypothesis_self_attn, axis=1)

        ### Attention ###

        scores_all = []
        premise_attn = []
        alphas = []
        r_alpha = []
        r_all = []

        for i in range(self.sequence_length):
            scores_i_list = []
            r_i_list = []
            for j in range(self.sequence_length):
                # calculate similarity score_ij (e_ij)

                score_ij_ori = tf.reduce_sum(tf.multiply(
                    premise_list[i], hypothesis_list[j]),
                                             1,
                                             keep_dims=True)
                ext_r = tf.expand_dims(self.R_mat[:, i, j], axis=1)
                score_ij = score_ij_ori + ext_r
                scores_i_list.append(score_ij)
                r_ij = self.R_mat[:, i, j]
                r_i_list.append(r_ij)
                #pdb.set_trace()
            scores_i = tf.stack(scores_i_list, axis=1)
            r_i = tf.expand_dims(tf.stack(r_i_list, axis=1), 2)
            # alpha_i: attention weights over hypothesis_bi
            alpha_i = blocks.masked_softmax(scores_i, mask_hyp)
            a_tilde_i = tf.reduce_sum(tf.multiply(alpha_i, hypothesis_bi), 1)
            premise_attn.append(a_tilde_i)

            r_alpha_i = tf.reduce_sum(tf.multiply(r_i, alpha_i), 1)

            scores_all.append(scores_i)
            alphas.append(alpha_i)
            r_alpha.append(r_alpha_i)
            r_all.append(r_i)

        scores_stack = tf.stack(scores_all, axis=2)
        scores_list = tf.unstack(scores_stack,
                                 axis=1)  # re-index from i to j

        r_stack = tf.stack(r_all, axis=2)
        r_list = tf.unstack(r_stack, axis=1)  # re-index from i to j

        hypothesis_attn = []
        betas = []
        r_beta = []
        for j in range(self.sequence_length):
            scores_j = scores_list[j]
            beta_j = blocks.masked_softmax(scores_j, mask_prem)
            b_tilde_j = tf.reduce_sum(tf.multiply(beta_j, premise_bi), 1)
            hypothesis_attn.append(b_tilde_j)

            r_j = r_list[j]
            r_beta_j = tf.reduce_sum(tf.multiply(r_j, beta_j), 1)
            r_beta.append(r_beta_j)

            betas.append(beta_j)
        # Make r_alpha and r_beta in tensor
        r_alphas = tf.stack(r_alpha, axis=1)
        r_betas = tf.stack(r_beta, axis=1)

        # Make attention-weighted sentence representations into one tensor,
        premise_attns = tf.stack(premise_attn, axis=1)
        hypothesis_attns = tf.stack(hypothesis_attn, axis=1)

        # For making attention plots,
        self.alpha_s = tf.stack(alphas, axis=2)
        self.beta_s = tf.stack(betas, axis=2)

        ### Subcomponent Inference ###
        prem_self_diff = tf.subtract(premise_bi, premise_self_attns)
        prem_self_mul = tf.multiply(premise_bi, premise_self_attns)
        hyp_self_diff = tf.subtract(hypothesis_bi, hypothesis_self_attns)
        hyp_self_mul = tf.multiply(hypothesis_bi, hypothesis_self_attns)

        prem_diff = tf.subtract(premise_bi, premise_attns)
        prem_mul = tf.multiply(premise_bi, premise_attns)
        hyp_diff = tf.subtract(hypothesis_bi, hypothesis_attns)
        hyp_mul = tf.multiply(hypothesis_bi, hypothesis_attns)

        ### Factorize Machine ###
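        # Each factorize_machine call reduces a per-timestep feature block to a
        # single value, which is expanded to one channel and concatenated with
        # the BiLSTM features in m_a / m_b below.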

        FM_premise_self_attns = tf.expand_dims(
            blocks.factorize_machine(
                tf.concat([premise_bi, premise_self_attns], 2)), 2)
        FM_prem_self_diff = tf.expand_dims(
            blocks.factorize_machine(prem_self_diff), 2)
        FM_prem_self_mul = tf.expand_dims(
            blocks.factorize_machine(prem_self_mul), 2)

        FM_hypothesis_self_attns = tf.expand_dims(
            blocks.factorize_machine(
                tf.concat([hypothesis_bi, hypothesis_self_attns], 2)), 2)
        FM_hyp_self_diff = tf.expand_dims(
            blocks.factorize_machine(hyp_self_diff), 2)
        FM_hyp_self_mul = tf.expand_dims(
            blocks.factorize_machine(hyp_self_mul), 2)

        FM_premise_attns = tf.expand_dims(
            blocks.factorize_machine(tf.concat([premise_bi, premise_attns],
                                               2)), 2)
        FM_prem_diff = tf.expand_dims(blocks.factorize_machine(prem_diff), 2)
        FM_prem_mul = tf.expand_dims(blocks.factorize_machine(prem_mul), 2)

        FM_hypothesis_attns = tf.expand_dims(
            blocks.factorize_machine(
                tf.concat([hypothesis_bi, hypothesis_attns], 2)), 2)
        FM_hyp_diff = tf.expand_dims(blocks.factorize_machine(hyp_diff), 2)
        FM_hyp_mul = tf.expand_dims(blocks.factorize_machine(hyp_mul), 2)

        m_a = tf.concat([
            premise_bi, FM_premise_attns, FM_prem_diff, FM_prem_mul,
            FM_premise_self_attns, FM_prem_self_diff, FM_prem_self_mul,
            r_alphas
        ], 2)
        m_b = tf.concat([
            hypothesis_bi, FM_hypothesis_attns, FM_hyp_diff, FM_hyp_mul,
            FM_hypothesis_self_attns, FM_hyp_self_diff, FM_hyp_self_mul,
            r_betas
        ], 2)

        ### Inference Composition ###

        v1_outs, c3 = blocks.biLSTM(m_a,
                                    dim=self.dim,
                                    seq_len=prem_seq_lengths,
                                    name='v1')
        v2_outs, c4 = blocks.biLSTM(m_b,
                                    dim=self.dim,
                                    seq_len=hyp_seq_lengths,
                                    name='v2')

        v1_bi = tf.concat(v1_outs, axis=2)
        v2_bi = tf.concat(v2_outs, axis=2)

        ### Pooling Layer ###

        v_1_sum = tf.reduce_sum(v1_bi, 1)
        v_1_ave = tf.div(
            v_1_sum, tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1))

        v_2_sum = tf.reduce_sum(v2_bi, 1)
        v_2_ave = tf.div(
            v_2_sum, tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1))

        v_1_max = tf.reduce_max(v1_bi, 1)
        v_2_max = tf.reduce_max(v2_bi, 1)
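        # Extra weighted pooling: attention weights derived from the external
        # relation features (r_alphas / r_betas) gate the composed representations.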

        alpha_w = blocks.masked_softmax(blocks.dense(r_alphas, 1), mask_prem)
        a_w = tf.reduce_sum(tf.multiply(alpha_w, v1_bi), 1)

        beta_w = blocks.masked_softmax(blocks.dense(r_betas, 1), mask_hyp)
        b_w = tf.reduce_sum(tf.multiply(beta_w, v2_bi), 1)

        v = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max, a_w, b_w], 1)

        # MLP layer
        h_mlp = tf.nn.tanh(tf.matmul(v, self.W_mlp) + self.b_mlp)

        # Dropout applied to classifier
        h_drop = tf.nn.dropout(h_mlp, self.keep_rate_ph)

        # Get prediction
        self.logits = tf.matmul(h_drop, self.W_cl) + self.b_cl

        # Define the cost function
        self.total_cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y,
                                                           logits=self.logits))
Example #6
    def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
        ## Define hyperparameters
        ## note: embedding_dim and hidden_dim are both 300, used interchangeably
        self.embedding_dim = emb_dim
        self.dim = hidden_dim
        self.sequence_length = seq_length

        ## Define the placeholders
        self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
        self.hypothesis_x = tf.placeholder(tf.int32,
                                           [None, self.sequence_length])
        self.y = tf.placeholder(tf.int32, [None])
        self.keep_rate_ph = tf.placeholder(tf.float32, [])

        ## Define parameters
        self.E = tf.Variable(embeddings, trainable=emb_train)

        self.W_mlp = tf.Variable(
            tf.random_normal([self.dim * 8, self.dim], stddev=0.1))
        self.b_mlp = tf.Variable(tf.random_normal([self.dim], stddev=0.1))

        self.W_cl = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
        self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

        ## Function for embedding lookup and dropout at embedding layer
        def emb_drop(x):
            emb = tf.nn.embedding_lookup(self.E, x)
            emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
            return emb_drop

        # Get lengths of unpadded sentences
        prem_seq_lengths, mask_prem = blocks.length(self.premise_x)
        hyp_seq_lengths, mask_hyp = blocks.length(self.hypothesis_x)

        ### First biLSTM layer ###

        premise_in = emb_drop(self.premise_x)
        hypothesis_in = emb_drop(self.hypothesis_x)

        premise_outs, c1 = blocks.biLSTM(premise_in,
                                         dim=self.dim,
                                         seq_len=prem_seq_lengths,
                                         name='premise')
        hypothesis_outs, c2 = blocks.biLSTM(hypothesis_in,
                                            dim=self.dim,
                                            seq_len=hyp_seq_lengths,
                                            name='hypothesis')

        premise_bi = tf.concat(premise_outs, axis=2)
        hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

        premise_list = tf.unstack(premise_bi, axis=1)
        hypothesis_list = tf.unstack(hypothesis_bi, axis=1)

        ### Attention ###

        scores_all = []
        premise_attn = []
        alphas = []

        for i in range(self.sequence_length):

            scores_i_list = []
            for j in range(self.sequence_length):
                score_ij = tf.reduce_sum(tf.multiply(premise_list[i],
                                                     hypothesis_list[j]),
                                         1,
                                         keep_dims=True)
                scores_i_list.append(score_ij)

            scores_i = tf.stack(scores_i_list, axis=1)
            alpha_i = blocks.masked_softmax(scores_i, mask_hyp)
            a_tilde_i = tf.reduce_sum(tf.multiply(alpha_i, hypothesis_bi), 1)
            premise_attn.append(a_tilde_i)

            scores_all.append(scores_i)
            alphas.append(alpha_i)

        scores_stack = tf.stack(scores_all, axis=2)
        scores_list = tf.unstack(scores_stack, axis=1)

        hypothesis_attn = []
        betas = []
        for j in range(self.sequence_length):
            scores_j = scores_list[j]
            beta_j = blocks.masked_softmax(scores_j, mask_prem)
            b_tilde_j = tf.reduce_sum(tf.multiply(beta_j, premise_bi), 1)
            hypothesis_attn.append(b_tilde_j)

            betas.append(beta_j)

        # Make attention-weighted sentence representations into one tensor,
        premise_attns = tf.stack(premise_attn, axis=1)
        hypothesis_attns = tf.stack(hypothesis_attn, axis=1)

        # For making attention plots,
        self.alpha_s = tf.stack(alphas, axis=2)
        self.beta_s = tf.stack(betas, axis=2)

        ### Subcomponent Inference ###

        prem_diff = tf.subtract(premise_bi, premise_attns)
        prem_mul = tf.multiply(premise_bi, premise_attns)
        hyp_diff = tf.subtract(hypothesis_bi, hypothesis_attns)
        hyp_mul = tf.multiply(hypothesis_bi, hypothesis_attns)

        m_a = tf.concat([premise_bi, premise_attns, prem_diff, prem_mul], 2)
        m_b = tf.concat([hypothesis_bi, hypothesis_attns, hyp_diff, hyp_mul],
                        2)

        ### Inference Composition ###

        v1_outs, c3 = blocks.biLSTM(m_a,
                                    dim=self.dim,
                                    seq_len=prem_seq_lengths,
                                    name='v1')
        v2_outs, c4 = blocks.biLSTM(m_b,
                                    dim=self.dim,
                                    seq_len=hyp_seq_lengths,
                                    name='v2')

        v1_bi = tf.concat(v1_outs, axis=2)
        v2_bi = tf.concat(v2_outs, axis=2)

        ### Pooling Layer ###

        v_1_sum = tf.reduce_sum(v1_bi, 1)
        v_1_ave = tf.div(
            v_1_sum, tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1))

        v_2_sum = tf.reduce_sum(v2_bi, 1)
        v_2_ave = tf.div(
            v_2_sum, tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1))

        v_1_max = tf.reduce_max(v1_bi, 1)
        v_2_max = tf.reduce_max(v2_bi, 1)

        v = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max], 1)

        # MLP layer
        h_mlp = tf.nn.tanh(tf.matmul(v, self.W_mlp) + self.b_mlp)

        ############### MY CODE STARTS #####

        # Define layer size
        self.bow_layer_size = 300

        # LSTM layer (final layer of the original esim model)
        h_fc1 = h_mlp
        h_fc1 = tf.zeros_like(h_mlp)  # Don't need the ESIM output

        # Bag-of-word input (averaging word embeddings)
        bow_pre = premise_in
        bow_hyp = hypothesis_in
        bag_of_word_in = tf.reduce_mean(tf.concat([bow_pre, bow_hyp], 1), 1)
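        # Average the embeddings over the concatenated premise + hypothesis
        # tokens, giving one (batch, emb_dim) bag-of-words vector per pair.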

        # Bag-of-word input layer params
        W_fc2 = tf.Variable(
            tf.random_normal([self.dim, self.bow_layer_size], stddev=0.1))
        b_fc2 = tf.Variable(tf.zeros([self.bow_layer_size]))
        h_fc2 = tf.nn.relu(tf.matmul(bag_of_word_in, W_fc2) + b_fc2)

        # Bag-of-word output layer params
        self.W_cl = tf.Variable(
            tf.random_normal([self.dim + self.bow_layer_size, 3], stddev=0.1))
        self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

        pad2 = tf.zeros_like(h_fc1, tf.float32)

        # Compute both cost and prediction using yconv_contact_H
        yconv_contact_H = tf.nn.dropout(tf.concat([pad2, h_fc2], 1),
                                        self.keep_rate_ph)
        y_conv_H = tf.matmul(yconv_contact_H, self.W_cl) + self.b_cl

        y_conv_pred = y_conv_H
        y_conv_loss = y_conv_H
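        # With h_fc1 zeroed out above, both the prediction and the loss come
        # from the bag-of-words branch alone.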

        self.logits = y_conv_pred  # Prediction
        self.total_cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.y, logits=self.logits))  # Cost
Example #7
    def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
        ## Define hyperparameters
        self.embedding_dim = emb_dim
        self.dim = hidden_dim
        self.sequence_length = seq_length

        ## Define the placeholders
        self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
        self.hypothesis_x = tf.placeholder(tf.int32,
                                           [None, self.sequence_length])
        self.y = tf.placeholder(tf.int32, [None])
        self.keep_rate_ph = tf.placeholder(tf.float32, [])

        ## Define parameters
        self.E = tf.Variable(embeddings, trainable=emb_train)

        self.W_mlp = tf.Variable(
            tf.random_normal([self.dim * 8, self.dim], stddev=0.1))
        self.b_mlp = tf.Variable(tf.random_normal([self.dim], stddev=0.1))

        self.W_cl = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
        self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

        ## Function for embedding lookup and dropout at embedding layer
        def emb_drop(x):
            emb = tf.nn.embedding_lookup(self.E, x)
            emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
            return emb_drop

        # Get lengths of unpadded sentences
        prem_seq_lengths, prem_mask = blocks.length(self.premise_x)
        hyp_seq_lengths, hyp_mask = blocks.length(self.hypothesis_x)

        ### BiLSTM layer ###
        premise_in = emb_drop(self.premise_x)
        hypothesis_in = emb_drop(self.hypothesis_x)

        premise_outs, c1 = blocks.biLSTM(premise_in,
                                         dim=self.dim,
                                         seq_len=prem_seq_lengths,
                                         name='premise')
        hypothesis_outs, c2 = blocks.biLSTM(hypothesis_in,
                                            dim=self.dim,
                                            seq_len=hyp_seq_lengths,
                                            name='hypothesis')

        premise_bi = tf.concat(premise_outs, axis=2)
        hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

        #premise_final = blocks.last_output(premise_bi, prem_seq_lengths)
        #hypothesis_final =  blocks.last_output(hypothesis_bi, hyp_seq_lengths)

        ### Mean pooling
        premise_sum = tf.reduce_sum(premise_bi, 1)
        premise_ave = tf.div(
            premise_sum,
            tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1))

        hypothesis_sum = tf.reduce_sum(hypothesis_bi, 1)
        hypothesis_ave = tf.div(
            hypothesis_sum,
            tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1))

        ### Mou et al. concat layer ###
        diff = tf.subtract(premise_ave, hypothesis_ave)
        mul = tf.multiply(premise_ave, hypothesis_ave)
        h = tf.concat([premise_ave, hypothesis_ave, diff, mul], 1)
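        # h concatenates the two averaged encodings with their difference and
        # elementwise product (Mou et al.-style matching features), 8*dim in total.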

        # MLP layer
        h_mlp = tf.nn.relu(tf.matmul(h, self.W_mlp) + self.b_mlp)
        # Dropout applied to classifier
        h_drop = tf.nn.dropout(h_mlp, self.keep_rate_ph)

        # Get prediction
        self.logits = tf.matmul(h_drop, self.W_cl) + self.b_cl

        # Define the cost function
        self.total_cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y,
                                                           logits=self.logits))
Example #8
    def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
        ## Define hyperparameters
        self.embedding_dim = emb_dim
        self.dim = hidden_dim
        self.sequence_length = seq_length

        ## Define the placeholders
        self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
        self.hypothesis_x = tf.placeholder(tf.int32,
                                           [None, self.sequence_length])
        self.y = tf.placeholder(tf.int32, [None])
        self.keep_rate_ph = tf.placeholder(tf.float32, [])

        ## Define parameters
        self.E = tf.Variable(embeddings, trainable=emb_train)

        self.W_mlp = tf.Variable(
            tf.random_normal([self.dim * 8, self.dim], stddev=0.1))
        self.b_mlp = tf.Variable(tf.random_normal([self.dim], stddev=0.1))

        self.W_cl = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
        self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

        # Function for embedding lookup and dropout at embedding layer
        # Dropout ignores part of the feature detectors (sets some hidden units to 0)
        def emb_drop(x):
            emb = tf.nn.embedding_lookup(self.E, x)
            emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
            return emb_drop

        # Get lengths of unpadded sentences
        prem_seq_lengths, mask_prem = blocks.length(self.premise_x)
        hyp_seq_lengths, mask_hyp = blocks.length(self.hypothesis_x)

        # ———————————————————————— Input encoding stage ——————————————————————————

        premise_in = emb_drop(self.premise_x)
        hypothesis_in = emb_drop(self.hypothesis_x)

        # Re-encode each word together with its context using a BiLSTM
        premise_outs, c1 = blocks.biLSTM(premise_in,
                                         dim=self.dim,
                                         seq_len=prem_seq_lengths,
                                         name='premise')
        hypothesis_outs, c2 = blocks.biLSTM(hypothesis_in,
                                            dim=self.dim,
                                            seq_len=hyp_seq_lengths,
                                            name='hypothesis')
        print('premise_outs: ', premise_outs)

        premise_bi = tf.concat(premise_outs, axis=2)
        hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

        premise_list = tf.unstack(premise_bi, axis=1)
        hypothesis_list = tf.unstack(hypothesis_bi, axis=1)
        print('hypothesis_list: ', hypothesis_list)

        # Attention mechanism
        scores_all = []
        premise_attn = []
        alphas = []

        for i in range(self.sequence_length):

            scores_i_list = []
            for j in range(self.sequence_length):
                # Compute the similarity (dot product) between the i-th premise word and every hypothesis word
                # this score is the e term from the paper
                score_ij = tf.reduce_sum(tf.multiply(premise_list[i],
                                                     hypothesis_list[j]),
                                         1,
                                         keep_dims=True)
                scores_i_list.append(score_ij)

            scores_i = tf.stack(scores_i_list, axis=1)
            alpha_i = blocks.masked_softmax(scores_i,
                                            mask_hyp)  # normalize into weights via softmax
            a_tilde_i = tf.reduce_sum(tf.multiply(alpha_i, hypothesis_bi),
                                      1)  # represent premise word i as a weighted sum of sentence b's word vectors
            premise_attn.append(a_tilde_i)

            scores_all.append(scores_i)
            alphas.append(alpha_i)

        # Convert the scores into a list (indexed by j)
        scores_stack = tf.stack(scores_all, axis=2)
        scores_list = tf.unstack(scores_stack, axis=1)

        # Repeat the same procedure for sentence b (the hypothesis)
        hypothesis_attn = []
        betas = []
        for j in range(self.sequence_length):
            scores_j = scores_list[j]
            beta_j = blocks.masked_softmax(scores_j, mask_prem)
            b_tilde_j = tf.reduce_sum(tf.multiply(beta_j, premise_bi), 1)
            hypothesis_attn.append(b_tilde_j)

            betas.append(beta_j)

        # Make attention-weighted sentence representations into one tensor,
        premise_attns = tf.stack(premise_attn, axis=1)
        hypothesis_attns = tf.stack(hypothesis_attn, axis=1)

        # For making attention plots,
        self.alpha_s = tf.stack(alphas, axis=2)
        self.beta_s = tf.stack(betas, axis=2)

        # Enhancement of local inference information
        # The following computes the difference and interaction features
        prem_diff = tf.subtract(premise_bi, premise_attns)
        prem_mul = tf.multiply(premise_bi, premise_attns)
        hyp_diff = tf.subtract(hypothesis_bi, hypothesis_attns)
        hyp_mul = tf.multiply(hypothesis_bi, hypothesis_attns)

        m_a = tf.concat([premise_bi, premise_attns, prem_diff, prem_mul], 2)
        m_b = tf.concat([hypothesis_bi, hypothesis_attns, hyp_diff, hyp_mul],
                        2)

        # Inference Composition
        # Use a BiLSTM to compose the overall inference relationship between premise and hypothesis
        v1_outs, c3 = blocks.biLSTM(m_a,
                                    dim=self.dim,
                                    seq_len=prem_seq_lengths,
                                    name='v1')
        v2_outs, c4 = blocks.biLSTM(m_b,
                                    dim=self.dim,
                                    seq_len=hyp_seq_lengths,
                                    name='v2')

        v1_bi = tf.concat(v1_outs, axis=2)
        v2_bi = tf.concat(v2_outs, axis=2)

        # Pooling Layer
        v_1_sum = tf.reduce_sum(v1_bi, 1)
        v_1_ave = tf.div(
            v_1_sum, tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1))

        v_2_sum = tf.reduce_sum(v2_bi, 1)
        v_2_ave = tf.div(
            v_2_sum, tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1))

        v_1_max = tf.reduce_max(v1_bi, 1)
        v_2_max = tf.reduce_max(v2_bi, 1)

        v = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max], 1)

        # Finally, classify with an MLP layer
        h_mlp = tf.nn.tanh(tf.matmul(v, self.W_mlp) + self.b_mlp)

        # Dropout applied to classifier
        h_drop = tf.nn.dropout(h_mlp, self.keep_rate_ph)

        # Get prediction
        self.logits = tf.matmul(h_drop, self.W_cl) + self.b_cl

        # Define the cost function
        self.total_cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y,
                                                           logits=self.logits))