Exemplo n.º 1
0
    def _multihead_attention_layer(self,
                                   layer_idx,
                                   query,
                                   memory=None,
                                   mask=None):
        """Apply one multi-head scaled dot-product attention layer.

        Args:
            layer_idx: Index selecting this layer's projection weights and
                biases, its leaky-ReLU ratio and its dropout rate.
            query: Tensor the queries are projected from. The split on
                axis 2 below requires it to be rank 3; presumably
                [batch, q_size, features] -- TODO confirm against caller.
            memory: Tensor the keys and values are projected from. When
                None, `query` is used (self-attention).
            mask: Optional rank-3 multiplicative mask where 1.0 keeps a
                score and 0.0 masks it out. It is tiled `num_att_header`
                times along axis 0 to line up with the head-stacked batch.

        Returns:
            The attention output with all heads merged back along the last
            axis.
        """
        if memory is None:
            memory = query

        num_heads = self.num_att_header
        leaky_ratio = self.leaky_ratio[layer_idx]

        def _project(inputs, tag):
            # Linear projection + leaky ReLU for one of "Q", "K" or "V".
            projected = ne.fully_conn(
                inputs,
                self.att_weights["W_att_{}_l{}_0".format(tag, layer_idx)],
                self.att_biases["b_att_{}_l{}_0".format(tag, layer_idx)])
            return ne.leaky_relu(projected, leaky_ratio)

        def _split_heads(tensor):
            # Cut the last axis into heads and stack them on the batch axis.
            return tf.concat(tf.split(tensor, num_heads, axis=2), axis=0)

        # Linear project to d_model dimension: [batch, q_size/k_size, d_model]
        Q = _project(query, "Q")
        K = _project(memory, "K")
        V = _project(memory, "V")

        # Split the matrix to multiple heads and then concatenate to have a
        # larger batch size: [h*batch, q_size/k_size, d_model/num_heads]
        Q_split = _split_heads(Q)
        K_split = _split_heads(K)
        V_split = _split_heads(V)

        # Identity check, not `!=`: comparison operators are dispatched to
        # the mask object itself and may yield an elementwise result.
        if mask is not None:
            mask = tf.tile(mask, [num_heads, 1, 1])

        # Apply scaled dot product attention
        d = self.feature_size // num_heads
        assert d == Q_split.shape[-1] == K_split.shape[-1] == V_split.shape[-1]

        out = tf.matmul(Q_split,
                        tf.transpose(K_split,
                                     [0, 2, 1]))  # [h*batch, q_size, k_size]
        out = out / tf.sqrt(tf.cast(d, tf.float32))  # scaled by sqrt(d_k)

        if mask is not None:
            # Masked-out positions (mask == 0.0) are replaced by a very large
            # negative score so the softmax gives them ~zero weight.
            out = tf.multiply(out, mask) + (1.0 - mask) * (-1e10)

        out = ne.softmax(out)  # [h * batch, q_size, k_size]
        out = ne.dropout(out, self.drop_rate[layer_idx], self.is_training)
        out = tf.matmul(out, V_split)  # [h * batch, q_size, d_model / h]

        # Merge the multi-head back to the original shape
        out = tf.concat(tf.split(out, num_heads, axis=0),
                        axis=2)  # [bs, q_size, d_model]

        return out
Exemplo n.º 2
0
    def in_layer(self, inputs, W_name="W_in_", b_name="b_in_"):
        """Flatten the input, run the input FC stack, and restore h x w.

        Args:
            inputs: Tensor whose static shape after the leading axis is
                (h, w, c) with h * w * c == self.in_state.
            W_name: Key prefix for entries of `self.in_weight`; the layer
                index is appended to it.
            b_name: Key prefix for entries of `self.in_bias`; the layer
                index is appended to it.

        Returns:
            Tensor reshaped to [-1, h, w, out_channel] and named
            'in_output', where out_channel = self.in_fc_states[-1] / (h * w).
        """
        height, width, channels = inputs.get_shape().as_list()[1:]
        assert height * width * channels == self.in_state
        flat = tf.reshape(inputs, [-1, self.in_state])

        for idx, _ in enumerate(self.in_fc_states):
            weights = self.in_weight["{0}{1}".format(W_name, idx)]
            biases = self.in_bias["{0}{1}".format(b_name, idx)]

            # Optional normalisation comes first, then the activation, and
            # only then the linear map (pre-activation ordering).
            if self.in_norm == "BATCH":
                flat = ne.batch_norm(flat, self.is_training)
            elif self.in_norm == "LAYER":
                flat = ne.layer_norm(flat, self.is_training)
            activated = ne.leaky_relu(flat, self.in_leaky_ratio)
            flat = ne.fully_conn(activated, weights, biases)

        # The width of the last FC layer must factor exactly into
        # h * w * out_channel.
        last_width = self.in_fc_states[-1]
        out_channel = last_width // height // width
        assert height * width * out_channel == last_width
        spatial = tf.reshape(flat, [-1, height, width, out_channel])
        return tf.identity(spatial, name='in_output')
Exemplo n.º 3
0
    def out_layer(self, inputs, label=None, W_name="W_out_", b_name="b_out_"):
        """Flatten the input, run the output FC stack, and restore h x w.

        Args:
            inputs: Tensor whose static shape after the leading axis is
                (h, w, c) with h * w * c == self.out_state.
            label: Tensor concatenated to the flattened input along the last
                axis. Required when `self.use_class_label` is truthy and
                ignored otherwise.
            W_name: Key prefix for entries of `self.out_weight`; the layer
                index is appended to it.
            b_name: Key prefix for entries of `self.out_bias`; the layer
                index is appended to it.

        Returns:
            Tensor reshaped to [-1, h, w, out_channel_size] and named
            'out_output', where
            out_channel_size = self.out_fc_states[-1] / (h * w).

        Raises:
            ValueError: If `self.use_class_label` is set but `label` is None.
        """
        net = inputs
        h, w, c = net.get_shape().as_list()[1:]
        assert h * w * c == self.out_state
        net = tf.reshape(net, [-1, self.out_state])
        if self.use_class_label:
            # Fail early with a clear message instead of letting tf.concat
            # choke on a None entry.
            if label is None:
                raise ValueError(
                    "out_layer: `label` is required when use_class_label "
                    "is enabled")
            net = tf.concat([net, label], -1)

        for layer_id in range(len(self.out_fc_states)):
            weight_name = "{}{}".format(W_name, layer_id)
            bias_name = "{}{}".format(b_name, layer_id)
            curr_weight = self.out_weight[weight_name]
            curr_bias = self.out_bias[bias_name]

            # Linear map first, then optional normalisation, then activation.
            net = ne.fully_conn(net, curr_weight, curr_bias)
            if self.out_norm == "BATCH":
                net = ne.batch_norm(net, self.is_training)
            elif self.out_norm == "LAYER":
                net = ne.layer_norm(net, self.is_training)
            net = ne.leaky_relu(net, self.out_leaky_ratio)

        # The width of the last FC layer must factor exactly into
        # h * w * out_channel_size.
        out_channel_size = self.out_fc_states[-1] // h // w
        assert h * w * out_channel_size == self.out_fc_states[-1]
        net = tf.reshape(net, [-1, h, w, out_channel_size])
        net = tf.identity(net, name='out_output')
        return net
Exemplo n.º 4
0
 def _func(net, layer_id, postfix="", act_func="leaky"):
     """Apply one FC layer: linear map, optional norm, optional activation.

     `self`, `W_name` and `b_name` are not parameters; they are resolved
     from the scope surrounding this function.

     Args:
         net: Input tensor for the layer.
         layer_id: Layer index; used in the weight/bias keys and to pick the
             leaky-ReLU ratio.
         postfix: Extra suffix appended to the weight/bias keys.
         act_func: "leaky" applies leaky ReLU, "soft" applies softplus; any
             other value leaves the normalised output without activation.

     Returns:
         The layer output tensor.
     """
     key_suffix = "{}{}".format(layer_id, postfix)
     w_key = "{}{}".format(W_name, key_suffix)
     b_key = "{}{}".format(b_name, key_suffix)
     layer_weights = self.enfc_weights[w_key]
     layer_biases = self.enfc_biases[b_key]

     hidden = ne.fully_conn(net, weights=layer_weights, biases=layer_biases)

     # Optional normalisation, selected by self.use_norm.
     if self.use_norm == "BATCH":
         hidden = ne.batch_norm(hidden, self.is_training, axis=1)
     elif self.use_norm == "LAYER":
         hidden = ne.layer_norm(hidden, self.is_training)

     if act_func == "leaky":
         return ne.leaky_relu(hidden, self.enfc_leaky_ratio[layer_id])
     if act_func == "soft":
         return tf.nn.softplus(hidden)
     return hidden
Exemplo n.º 5
0
    def _fc_layers(self, inputs, weights_dict, biases_dict, fc_leaky_ratio,
                   fc_drop_rate, num_fc, W_name, b_name):
        """Run `num_fc` fully-connected layers over `inputs`.

        Each layer is: linear map, batch norm on the last axis (only when
        self.use_norm == "BATCH"), leaky ReLU, then dropout.

        Args:
            inputs: Input tensor for the first layer.
            weights_dict: Mapping from "<W_name><layer index>" to weights.
            biases_dict: Mapping from "<b_name><layer index>" to biases.
            fc_leaky_ratio: Per-layer leaky-ReLU ratios, indexed by layer.
            fc_drop_rate: Per-layer dropout rates, indexed by layer.
            num_fc: Number of layers to apply.
            W_name: Key prefix for `weights_dict`.
            b_name: Key prefix for `biases_dict`.

        Returns:
            The output of the last layer, wrapped in an identity op named
            'output'.
        """
        hidden = inputs
        use_batch_norm = self.use_norm == "BATCH"

        layer_id = 0
        while layer_id < num_fc:
            layer_weights = weights_dict["{0}{1}".format(W_name, layer_id)]
            layer_biases = biases_dict["{0}{1}".format(b_name, layer_id)]

            hidden = ne.fully_conn(hidden,
                                   weights=layer_weights,
                                   biases=layer_biases)
            if use_batch_norm:
                hidden = ne.batch_norm(hidden, self.is_training, axis=-1)
            hidden = ne.leaky_relu(hidden, fc_leaky_ratio[layer_id])
            hidden = ne.drop_out(hidden, fc_drop_rate[layer_id],
                                 self.is_training)
            layer_id += 1

        return tf.identity(hidden, name='output')
Exemplo n.º 6
0
    def defc_layers(self, inputs, W_name="W_defc", b_name="b_defc"):
        """Run the decoder FC stack and reshape to the deconv input shape.

        Each layer is: linear map, batch norm on axis 1 (only when
        self.use_batch_norm is truthy), leaky ReLU, then dropout.

        Args:
            inputs: Input tensor for the first layer.
            W_name: Key prefix for entries of `self.defc_weights`; the layer
                index is appended to it.
            b_name: Key prefix for entries of `self.defc_biases`; the layer
                index is appended to it.

        Returns:
            The last layer's output (passed through an identity op named
            'output') reshaped to [-1] + self.decv_in_shape.
        """
        hidden = inputs

        # NOTE(review): the layer count is taken from self.num_enfc even
        # though this is the decoder stack -- confirm this is intentional.
        for idx in range(self.num_enfc):
            w_key = "{0}{1}".format(W_name, idx)
            b_key = "{0}{1}".format(b_name, idx)
            hidden = ne.fully_conn(hidden,
                                   weights=self.defc_weights[w_key],
                                   biases=self.defc_biases[b_key])

            if self.use_batch_norm:
                hidden = ne.batch_norm(hidden, self.is_training, axis=1)

            hidden = ne.leaky_relu(hidden, self.defc_leaky_ratio[idx])
            hidden = ne.drop_out(hidden, self.defc_drop_rate[idx],
                                 self.is_training)

        named = tf.identity(hidden, name='output')
        # List concatenation: decv_in_shape is expected to be a list here.
        target_shape = [-1] + self.decv_in_shape
        return tf.reshape(named, target_shape)
Exemplo n.º 7
0
    def enfc_layers(self, inputs, W_name="W_enfc", b_name="b_enfc"):
        """Flatten the conv output and run the encoder FC stack.

        Each layer is: linear map, optional normalisation chosen by
        self.use_norm ("BATCH" on axis 1, or "LAYER"), leaky ReLU, then
        dropout.

        Args:
            inputs: Tensor reshaped to [-1, d0 * d1 * d2], where d0..d2 are
                the first three entries of `self.conv_out_shape`.
            W_name: Key prefix for entries of `self.enfc_weights`; the layer
                index is appended to it.
            b_name: Key prefix for entries of `self.enfc_biases`; the layer
                index is appended to it.

        Returns:
            The output of the last layer, wrapped in an identity op named
            'output'.
        """
        conv_shape = self.conv_out_shape
        flat_size = conv_shape[0] * conv_shape[1] * conv_shape[2]
        hidden = tf.reshape(inputs, [-1, flat_size])

        for idx in range(self.num_enfc):
            w_key = "{0}{1}".format(W_name, idx)
            b_key = "{0}{1}".format(b_name, idx)
            layer_weights = self.enfc_weights[w_key]
            layer_biases = self.enfc_biases[b_key]

            hidden = ne.fully_conn(hidden,
                                   weights=layer_weights,
                                   biases=layer_biases)

            norm_kind = self.use_norm
            if norm_kind == "BATCH":
                hidden = ne.batch_norm(hidden, self.is_training, axis=1)
            elif norm_kind == "LAYER":
                hidden = ne.layer_norm(hidden, self.is_training)

            hidden = ne.leaky_relu(hidden, self.enfc_leaky_ratio[idx])
            hidden = ne.drop_out(hidden, self.enfc_drop_rate[idx],
                                 self.is_training)

        return tf.identity(hidden, name='output')