Example #1
    def __init__(
        self,
        input_dim,
        emb_dim,
        emb_freeze,
        d_model,
        pad_idx,
        dropout,
        embeddings=None,
        pretrain_feature_model=None,
    ):
        super().__init__()
        self.emb_dim = emb_dim
        self.d_model = d_model

        # A supplied pretrained feature model doubles the effective embedding
        # width (hence the emb_dim * 2 sizes below).
        self.pretrain_feature = pretrain_feature_model is not None
        if self.pretrain_feature:
            self.pos_encoder = utils.PositionalEncoding(emb_dim * 2)
            if self.emb_dim != self.d_model:
                self.proj = nn.Linear(emb_dim * 2, d_model)
        else:
            self.pos_encoder = utils.PositionalEncoding(emb_dim)
            if self.emb_dim != self.d_model:
                self.proj = nn.Linear(emb_dim, d_model)
        self.dropout = nn.Dropout(dropout)
        if self.pretrain_feature:
            self.emb1 = pretrain_feature_model
        self.emb = utils.embedding(input_dim, emb_dim, embeddings, emb_freeze,
                                   pad_idx)
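A forward pass consistent with this constructor might look like the sketch below. It is an assumption reconstructed from the fields defined above (the concatenation of pretrained features, in particular, is inferred from the emb_dim * 2 widths), not code from the source repository:

    # Hypothetical forward pass (not from the original repo). Assumes the
    # pretrained feature model's output is concatenated with the token
    # embeddings, which would explain the emb_dim * 2 sizes above.
    def forward(self, x):
        emb = self.emb(x)                                 # [batch, seq, emb_dim]
        if self.pretrain_feature:
            emb = torch.cat([emb, self.emb1(x)], dim=-1)  # -> emb_dim * 2
        emb = self.dropout(self.pos_encoder(emb))
        if hasattr(self, 'proj'):                         # built only when emb_dim != d_model
            emb = self.proj(emb)                          # -> d_model
        return emb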
Example #2
    def encoder_impl(self, encoder_input, is_training):

        attention_dropout_rate = self._config.attention_dropout_rate if is_training else 0.0
        residual_dropout_rate = self._config.residual_dropout_rate if is_training else 0.0

        # Mask
        encoder_padding = tf.equal(encoder_input, 0)
        # Embedding
        encoder_output = embedding(encoder_input,
                                   vocab_size=self._config.src_vocab_size,
                                   dense_size=self._config.hidden_units,
                                   multiplier=self._config.hidden_units**0.5
                                   if self._config.scale_embedding else 1.0,
                                   name="src_embedding")
        # Add positional signal
        encoder_output = common_attention.add_timing_signal_1d(encoder_output)
        # Dropout
        encoder_output = tf.layers.dropout(
            encoder_output,
            rate=residual_dropout_rate,
            training=is_training)

        # Blocks
        for i in range(self._config.num_blocks):
            with tf.variable_scope("block_{}".format(i)):
                # Multihead Attention
                encoder_output = residual(
                    encoder_output,
                    multihead_attention(
                        query_antecedent=encoder_output,
                        memory_antecedent=None,
                        bias=common_attention.attention_bias_ignore_padding(
                            encoder_padding),
                        total_key_depth=self._config.hidden_units,
                        total_value_depth=self._config.hidden_units,
                        output_depth=self._config.hidden_units,
                        num_heads=self._config.num_heads,
                        dropout_rate=attention_dropout_rate,
                        name='encoder_self_attention',
                        summaries=True),
                    dropout_rate=residual_dropout_rate)

                # Feed Forward
                encoder_output = residual(
                    encoder_output,
                    common_layers.conv_hidden_relu(
                        inputs=encoder_output,
                        hidden_size=4 * self._config.hidden_units,
                        output_size=self._config.hidden_units,
                        summaries=True),
                    dropout_rate=residual_dropout_rate)
        # Mask padding part to zeros.
        encoder_output *= tf.expand_dims(1.0 - tf.to_float(encoder_padding),
                                         axis=-1)
        return encoder_output
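The `residual` helper is not shown in this snippet. Given how it is called (sublayer output plus a dropout rate), a plausible definition is the post-norm residual of the original Transformer; treat this as an assumption about the project, not its actual code:

# Assumed definition of `residual` (post-norm, as in the original Transformer);
# the real project may normalize differently.
import tensorflow as tf

def residual(inputs, outputs, dropout_rate):
    outputs = tf.nn.dropout(outputs, keep_prob=1.0 - dropout_rate)
    return tf.contrib.layers.layer_norm(inputs + outputs)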
Example #3
    def __init__(
        self,
        input_dim,
        emb_dim,
        emb_freeze,
        pad_idx,
        embeddings=None
    ):
        super().__init__()
        self.emb_dim = emb_dim

        self.emb = utils.embedding(input_dim, emb_dim, embeddings, emb_freeze, pad_idx)
Example #4
    def __init__(
        self,
        input_dim,
        emb_dim,
        emb_freeze,
        pad_idx,
        dropout,
        embeddings=None
    ):
        super().__init__()
        self.emb_dim = emb_dim

        self.emb = utils.embedding(input_dim, emb_dim, embeddings, emb_freeze, pad_idx)
        self.pos_encoder = utils.PositionalEncoding(emb_dim)
        self.dropout = nn.Dropout(dropout)
Example #5
    def __init__(
        self,
        sep_idx,
        spe1_idx,
        spe2_idx,
        input_dim,
        emb_dim,
        emb_freeze,
        d_model,
        pad_idx,
        dropout,
        persona_vocab_size,
        use_mem_n2n,
        embeddings=None,
        pretrain_feature_model=None,
    ):
        super().__init__()
        self.sep_idx = sep_idx
        self.spe1_idx = spe1_idx
        self.spe2_idx = spe2_idx
        self.emb_dim = emb_dim
        self.input_dim = input_dim
        self.d_model = d_model

        self.pretrain_feature = pretrain_feature_model is not None
        if self.pretrain_feature:
            self.pos_encoder = utils.PositionalEncoding(emb_dim * 2)
            if self.emb_dim != self.d_model:
                self.proj = nn.Linear(emb_dim * 2, d_model)
        else:
            self.pos_encoder = utils.PositionalEncoding(emb_dim)
            # self.pos_encoder = nn.Embedding(512, emb_dim)
            if self.emb_dim != self.d_model:
                self.proj = nn.Linear(emb_dim, d_model)
        self.dropout = nn.Dropout(dropout)
        if self.pretrain_feature:
            self.emb1 = pretrain_feature_model
        self.emb = utils.embedding(input_dim, emb_dim, embeddings, emb_freeze,
                                   pad_idx)
        # Personas share the token embedding table unless a MemN2N-style
        # separate table is requested.
        self.persona_emb = self.emb
        if use_mem_n2n:
            self.persona_emb = nn.Embedding(persona_vocab_size, emb_dim)
Example #6
    def decoder_with_caching_impl(self, decoder_input, decoder_cache,
                                  encoder_output, is_training):
        # decoder_input: [batch_size * beam_size, step]; step grows by one each
        # call, i.e. 1, 2, 3, ...
        # decoder_cache: [batch_size * beam_size, 0, num_blocks, hidden_units]
        # encoder_output: [batch_size * beam_size, time_step, hidden_units]
        attention_dropout_rate = self._config.attention_dropout_rate if is_training else 0.0
        residual_dropout_rate = self._config.residual_dropout_rate if is_training else 0.0

        encoder_padding = tf.equal(
            tf.reduce_sum(tf.abs(encoder_output), axis=-1), 0.0)
        encoder_attention_bias = common_attention.attention_bias_ignore_padding(
            encoder_padding)

        decoder_output = embedding(decoder_input,
                                   vocab_size=self._config.dst_vocab_size,
                                   dense_size=self._config.hidden_units,
                                   multiplier=self._config.hidden_units**0.5
                                   if self._config.scale_embedding else 1.0,
                                   name="dst_embedding")
        # Positional Encoding
        decoder_output = common_attention.add_timing_signal_1d(decoder_output)
        # Dropout
        decoder_output = tf.layers.dropout(decoder_output,
                                           rate=residual_dropout_rate,
                                           training=is_training)

        new_cache = []

        # Blocks
        for i in range(self._config.num_blocks):
            with tf.variable_scope("block_{}".format(i)):
                # Multihead Attention (self-attention)
                decoder_output = residual(
                    decoder_output[:, -1:, :],
                    multihead_attention(
                        query_antecedent=decoder_output,
                        memory_antecedent=None,
                        bias=None,
                        total_key_depth=self._config.hidden_units,
                        total_value_depth=self._config.hidden_units,
                        num_heads=self._config.num_heads,
                        dropout_rate=attention_dropout_rate,
                        reserve_last=True,
                        output_depth=self._config.hidden_units,
                        name="decoder_self_attention",
                        summaries=True),
                    dropout_rate=residual_dropout_rate)

                # Multihead Attention (vanilla attention)
                decoder_output = residual(
                    decoder_output,
                    multihead_attention(
                        query_antecedent=decoder_output,
                        memory_antecedent=encoder_output,
                        bias=encoder_attention_bias,
                        total_key_depth=self._config.hidden_units,
                        total_value_depth=self._config.hidden_units,
                        output_depth=self._config.hidden_units,
                        num_heads=self._config.num_heads,
                        dropout_rate=attention_dropout_rate,
                        reserve_last=True,
                        name="decoder_vanilla_attention",
                        summaries=True),
                    dropout_rate=residual_dropout_rate)

                # Feed Forward
                decoder_output = residual(
                    decoder_output,
                    ff_hidden(decoder_output,
                              hidden_size=4 * self._config.hidden_units,
                              output_size=self._config.hidden_units,
                              activation=self._ff_activation),
                    dropout_rate=residual_dropout_rate)

                decoder_output = tf.concat(
                    [decoder_cache[:, :, i, :], decoder_output], axis=1)
                new_cache.append(decoder_output[:, :, None, :])

        new_cache = tf.concat(
            new_cache, axis=2)  # [batch_size, n_step, num_blocks, num_hidden]

        return decoder_output, new_cache
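The concat at the end of each block grows the cache's time axis by one position per decoding step. The NumPy sketch below illustrates the shape evolution only; the sizes are made up for illustration:

# Shape-only illustration of the per-block cache growth (hypothetical sizes).
import numpy as np

batch_beam, num_blocks, hidden = 8, 6, 512
cache = np.zeros((batch_beam, 0, num_blocks, hidden))    # step 0: empty time axis
for step in range(3):
    new_step = np.ones((batch_beam, 1, num_blocks, hidden))
    cache = np.concatenate([cache, new_step], axis=1)    # time axis grows 1, 2, 3, ...
print(cache.shape)                                       # (8, 3, 6, 512)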
Example #7
    def decoder_impl(self, decoder_input, encoder_output, is_training):
        # decoder_input: [batch_size, step]
        # encoder_output: [batch_size, time_step, hidden_units]
        attention_dropout_rate = self._config.attention_dropout_rate if is_training else 0.0
        residual_dropout_rate = self._config.residual_dropout_rate if is_training else 0.0

        encoder_padding = tf.equal(
            tf.reduce_sum(tf.abs(encoder_output), axis=-1), 0.0)
        encoder_attention_bias = common_attention.attention_bias_ignore_padding(
            encoder_padding)

        decoder_output = embedding(decoder_input,
                                   vocab_size=self._config.dst_vocab_size,
                                   dense_size=self._config.hidden_units,
                                   multiplier=self._config.hidden_units**0.5
                                   if self._config.scale_embedding else 1.0,
                                   name="dst_embedding")
        # Positional Encoding
        decoder_output = common_attention.add_timing_signal_1d(decoder_output)
        # Dropout
        decoder_output = tf.layers.dropout(decoder_output,
                                           rate=residual_dropout_rate,
                                           training=is_training)
        # Bias for preventing peeping later information
        self_attention_bias = common_attention.attention_bias_lower_triangle(
            tf.shape(decoder_input)[1])

        # Blocks
        for i in range(self._config.num_blocks_dec):
            with tf.variable_scope("block_{}".format(i)):
                # Multihead Attention (self-attention)
                decoder_output = residual(
                    decoder_output,
                    multihead_attention(
                        query_antecedent=decoder_output,
                        memory_antecedent=None,
                        bias=self_attention_bias,
                        total_key_depth=self._config.hidden_units,
                        total_value_depth=self._config.hidden_units,
                        num_heads=self._config.num_heads,
                        dropout_rate=attention_dropout_rate,
                        output_depth=self._config.hidden_units,
                        name="decoder_self_attention",
                        summaries=True),
                    dropout_rate=residual_dropout_rate)

                # Multihead Attention (vanilla attention)
                decoder_output = residual(
                    decoder_output,
                    multihead_attention(
                        query_antecedent=decoder_output,
                        memory_antecedent=encoder_output,
                        bias=encoder_attention_bias,
                        total_key_depth=self._config.hidden_units,
                        total_value_depth=self._config.hidden_units,
                        output_depth=self._config.hidden_units,
                        num_heads=self._config.num_heads,
                        dropout_rate=attention_dropout_rate,
                        name="decoder_vanilla_attention",
                        summaries=True),
                    dropout_rate=residual_dropout_rate)

                # Feed Forward
                decoder_output = residual(
                    decoder_output,
                    ff_hidden(decoder_output,
                              hidden_size=4 * self._config.hidden_units,
                              output_size=self._config.hidden_units,
                              activation=self._ff_activation),
                    dropout_rate=residual_dropout_rate)
        return decoder_output
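`attention_bias_lower_triangle` is what keeps each position from attending to later ones: a large negative value is added to the logits of every future position. A NumPy sketch of the effect (the exact constant is an implementation detail):

# NumPy sketch of a causal attention bias for a length-4 sequence.
import numpy as np

length = 4
future = np.triu(np.ones((length, length)), k=1)  # 1s strictly above the diagonal
bias = future * -1e9                              # added to the attention logits
# Row i now has -1e9 at columns j > i, so softmax assigns them ~zero weight.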
Example #8
    def tck2emb(self, streamlines: typing.List) -> np.ndarray:
        return embedding(self.net, streamlines)
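A hypothetical call site, assuming the streamlines come from a .tck tractogram loaded with nibabel (the loading code and file name are illustrations, not part of the source project):

# Hypothetical usage; `model` is an instance of the class above.
import nibabel as nib

tck = nib.streamlines.load("tracts.tck")
vectors = model.tck2emb(list(tck.streamlines))  # -> np.ndarray of embeddings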
Example #9
    def build_network(self):
        config = self.config
        de2idx, idx2de = load_de_vocab()
        en2idx, idx2en = load_en_vocab()

        # Encoder
        with tf.variable_scope("encoder"):
            ## Embedding
            self.enc = embedding(self.x,
                                 len(de2idx),
                                 num_units=config.hidden_dim,
                                 scale=True,
                                 scope='enc_embed')

            ## plus position embedding
            pos_indices = tf.tile(
                tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                [tf.shape(self.x)[0], 1])
            self.enc += embedding(pos_indices,
                                  config.maxlen,
                                  config.hidden_dim,
                                  zero_pad=False,
                                  scale=False,
                                  scope="enc_pe")

            self.enc = dropout(self.enc,
                               config.keep_rate,
                               is_train=self.is_train)

            self.enc_ = self.enc
            for block_idx in range(config.num_enc_block_1):
                scope = "encoder_block_{}".format(block_idx)
                enc_out = conv2d(self.enc,
                                 kernel_shape=(config.enc_kernel_width, 1),
                                 scope=scope)
                enc_out = batch_norm(enc_out,
                                     is_training=self.is_train,
                                     scope="lm" + scope)
                self.enc = enc_out

        # Decoder
        with tf.variable_scope("decoder"):
            ## Embedding
            self.dec = embedding(self.decode_input,
                                 len(en2idx),
                                 config.hidden_dim,
                                 scale=True,
                                 scope='dec_embed')
            ## plus position embedding
            pos_indices = tf.tile(
                tf.expand_dims(tf.range(tf.shape(self.decode_input)[1]), 0),
                [tf.shape(self.decode_input)[0], 1])
            self.dec += embedding(pos_indices,
                                  config.maxlen,
                                  config.hidden_dim,
                                  zero_pad=False,
                                  scale=False,
                                  scope='dec_pe')

            self.dec_ = self.dec
            for block_idx in range(config.num_dec_block_1):
                scope = "decoder_block_conv_{}".format(block_idx)
                attention_scope = "decoder_block_att_{}".format(block_idx)
                dec_out = conv2d(self.dec,
                                 kernel_shape=(config.dec_kernel_width, 1),
                                 causal=True,
                                 scope=scope)
                dec_out = attention_pool(self.enc_,
                                         self.dec,
                                         enc_out,
                                         dec_out,
                                         scope=attention_scope)
                dec_out = dec_out + self.dec
                dec_out = batch_norm(dec_out,
                                     is_training=self.is_train,
                                     scope="lm" + scope)
                self.dec = dec_out

        with tf.variable_scope('encoder'):
            for block_idx in range(config.num_enc_block_2):
                scope = "encoder_block_{}".format(config.num_enc_block_1 +
                                                  block_idx)
                enc_out = conv2d(self.enc,
                                 kernel_shape=(config.enc_kernel_width, 1),
                                 num_outputs=config.hidden_dim_2,
                                 scope=scope)
                enc_out = batch_norm(enc_out,
                                     is_training=self.is_train,
                                     scope="lm" + scope)
                self.enc = enc_out

        with tf.variable_scope('decoder'):
            for block_idx in range(config.num_dec_block_2):
                scope = "decoder_block_conv_{}".format(config.num_dec_block_1 +
                                                       block_idx)
                attention_scope = "decoder_block_att_{}".format(
                    config.num_dec_block_1 + block_idx)
                dec_out = conv2d(self.dec,
                                 kernel_shape=(config.dec_kernel_width, 1),
                                 num_outputs=config.hidden_dim_2,
                                 causal=True,
                                 scope=scope)
                dec_out = attention_pool(self.enc_,
                                         self.dec,
                                         enc_out,
                                         dec_out,
                                         scope=attention_scope)
                dec_out = dec_out + self.dec
                dec_out = batch_norm(dec_out,
                                     is_training=self.is_train,
                                     scope="lm" + scope)
                self.dec = dec_out

        with tf.variable_scope("softmax_layer"):
            w = tf.get_variable('w', [config.hidden_dim, len(en2idx)])
            b = tf.get_variable('b', [len(en2idx)])
            w = tf.tile(tf.expand_dims(w, 0), [config.batch_size, 1, 1])
            self.logits = tf.matmul(dec_out, w) + b
            self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) *
                self.istarget) / tf.reduce_sum(self.istarget)
            tf.summary.scalar('acc', self.acc)

            if self.is_train:
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_mean(self.loss)
                tf.summary.scalar('mean_loss', self.mean_loss)

        self.tensors = {
            'source_sentence': self.enc_,
            'target_sentence': self.dec_,
            'enc_out': enc_out,
            'dec_out': dec_out,
            'predictions': self.preds,
            'logits': self.logits
        }
        if self.is_train:
            self.tensors['loss'] = self.loss

        for key, value in self.tensors.items():
            tf.summary.histogram(key, value)
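`label_smoothing` is not defined in this snippet. A common definition, matching how it is applied to the one-hot targets above, is the following (epsilon = 0.1 is the usual default; this is an assumption about the project's implementation):

# Assumed definition of label_smoothing (the widely used form); the project's
# own implementation may differ.
def label_smoothing(inputs, epsilon=0.1):
    K = inputs.get_shape().as_list()[-1]  # number of classes
    return (1.0 - epsilon) * inputs + epsilon / K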