def __init__(
    self,
    input_dim,
    emb_dim,
    emb_freeze,
    d_model,
    pad_idx,
    dropout,
    embeddings=None,
    pretrain_feature_model=None,
):
    super().__init__()
    self.emb_dim = emb_dim
    self.d_model = d_model
    self.pretrain_feature = pretrain_feature_model is not None
    if self.pretrain_feature:
        # Pretrained features are concatenated with the token embedding,
        # doubling the input width.
        self.pos_encoder = utils.PositionalEncoding(emb_dim * 2)
        if self.emb_dim != self.d_model:
            self.proj = nn.Linear(emb_dim * 2, d_model)
    else:
        self.pos_encoder = utils.PositionalEncoding(emb_dim)
        if self.emb_dim != self.d_model:
            self.proj = nn.Linear(emb_dim, d_model)
    self.dropout = nn.Dropout(dropout)
    if self.pretrain_feature:
        self.emb1 = pretrain_feature_model
    self.emb = utils.embedding(input_dim, emb_dim, embeddings, emb_freeze, pad_idx)
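# `utils.PositionalEncoding` is referenced above but not defined in this section.
# A minimal sketch of a likely implementation, assuming the standard sinusoidal
# encoding from "Attention Is All You Need" and an even model width; the actual
# `utils` helper may differ.
import math
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float()
                             * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)  # even dims: sine
        pe[:, 1::2] = torch.cos(position * div_term)  # odd dims: cosine
        self.register_buffer("pe", pe.unsqueeze(0))  # [1, max_len, d_model]

    def forward(self, x):
        # x: [batch, seq_len, d_model]; add (not learn) the positional signal
        return x + self.pe[:, :x.size(1)]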
def encoder_impl(self, encoder_input, is_training):
    attention_dropout_rate = self._config.attention_dropout_rate if is_training else 0.0
    residual_dropout_rate = self._config.residual_dropout_rate if is_training else 0.0

    # Mask
    encoder_padding = tf.equal(encoder_input, 0)

    # Embedding
    encoder_output = embedding(encoder_input,
                               vocab_size=self._config.src_vocab_size,
                               dense_size=self._config.hidden_units,
                               multiplier=self._config.hidden_units**0.5
                               if self._config.scale_embedding else 1.0,
                               name="src_embedding")

    # Add positional signal
    encoder_output = common_attention.add_timing_signal_1d(encoder_output)

    # Dropout
    encoder_output = tf.layers.dropout(encoder_output,
                                       rate=self._config.residual_dropout_rate,
                                       training=is_training)

    # Blocks
    for i in range(self._config.num_blocks):
        with tf.variable_scope("block_{}".format(i)):
            # Multihead Attention
            encoder_output = residual(
                encoder_output,
                multihead_attention(
                    query_antecedent=encoder_output,
                    memory_antecedent=None,
                    bias=common_attention.attention_bias_ignore_padding(encoder_padding),
                    total_key_depth=self._config.hidden_units,
                    total_value_depth=self._config.hidden_units,
                    output_depth=self._config.hidden_units,
                    num_heads=self._config.num_heads,
                    dropout_rate=attention_dropout_rate,
                    name="encoder_self_attention",
                    summaries=True),
                dropout_rate=residual_dropout_rate)  # gated rate, so no dropout at inference

            # Feed Forward
            encoder_output = residual(
                encoder_output,
                common_layers.conv_hidden_relu(
                    inputs=encoder_output,
                    hidden_size=4 * self._config.hidden_units,
                    output_size=self._config.hidden_units,
                    summaries=True),
                dropout_rate=residual_dropout_rate)

    # Mask padding part to zeros.
    encoder_output *= tf.expand_dims(1.0 - tf.to_float(encoder_padding), axis=-1)

    return encoder_output
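# `residual` is defined elsewhere in this codebase. A plausible sketch, assuming
# the usual post-norm Transformer pattern (dropout on the sublayer output,
# residual add, then layer normalization); the real helper may differ in detail.
def residual(x, y, dropout_rate):
    y = tf.nn.dropout(y, keep_prob=1.0 - dropout_rate)
    return common_layers.layer_norm(x + y)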
def __init__(
    self,
    input_dim,
    emb_dim,
    emb_freeze,
    pad_idx,
    embeddings=None,
):
    super().__init__()
    self.emb_dim = emb_dim
    self.emb = utils.embedding(input_dim, emb_dim, embeddings, emb_freeze, pad_idx)
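# `utils.embedding` is not shown in this section. A minimal sketch of what it
# presumably does, assuming it builds an nn.Embedding and optionally initializes
# it from pretrained vectors with a freeze flag; the actual helper may differ.
def embedding(input_dim, emb_dim, embeddings=None, freeze=False, pad_idx=None):
    if embeddings is not None:
        # `embeddings` assumed to be a FloatTensor of shape [input_dim, emb_dim]
        return nn.Embedding.from_pretrained(embeddings, freeze=freeze, padding_idx=pad_idx)
    return nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)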
def __init__(
    self,
    input_dim,
    emb_dim,
    emb_freeze,
    pad_idx,
    dropout,
    embeddings=None,
):
    super().__init__()
    self.emb_dim = emb_dim
    self.emb = utils.embedding(input_dim, emb_dim, embeddings, emb_freeze, pad_idx)
    self.pos_encoder = utils.PositionalEncoding(emb_dim)
    self.dropout = nn.Dropout(dropout)
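# How these modules are typically composed in forward() (assumed; the actual
# forward method is not part of this section): scale the embedding, add the
# positional signal, then apply dropout, mirroring the TF encoder above.
def forward(self, src):
    x = self.emb(src) * math.sqrt(self.emb_dim)  # embedding scaling, as in the Transformer
    x = self.pos_encoder(x)
    return self.dropout(x)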
def __init__(
    self,
    sep_idx,
    spe1_idx,
    spe2_idx,
    input_dim,
    emb_dim,
    emb_freeze,
    d_model,
    pad_idx,
    dropout,
    persona_vocab_size,
    use_mem_n2n,
    embeddings=None,
    pretrain_feature_model=None,
):
    super().__init__()
    self.sep_idx = sep_idx
    self.spe1_idx = spe1_idx
    self.spe2_idx = spe2_idx
    self.emb_dim = emb_dim
    self.input_dim = input_dim
    self.d_model = d_model
    self.pretrain_feature = pretrain_feature_model is not None
    if self.pretrain_feature:
        self.pos_encoder = utils.PositionalEncoding(emb_dim * 2)
        if self.emb_dim != self.d_model:
            self.proj = nn.Linear(emb_dim * 2, d_model)
    else:
        self.pos_encoder = utils.PositionalEncoding(emb_dim)
        # self.pos_encoder = nn.Embedding(512, emb_dim)
        if self.emb_dim != self.d_model:
            self.proj = nn.Linear(emb_dim, d_model)
    self.dropout = nn.Dropout(dropout)
    if self.pretrain_feature:
        self.emb1 = pretrain_feature_model
    self.emb = utils.embedding(input_dim, emb_dim, embeddings, emb_freeze, pad_idx)
    # Persona tokens share the word embedding unless a MemN2N-style memory is
    # used, which gets its own table over the persona vocabulary.
    self.persona_emb = self.emb
    if use_mem_n2n:
        self.persona_emb = nn.Embedding(persona_vocab_size, emb_dim)
def decoder_with_caching_impl(self, decoder_input, decoder_cache, encoder_output, is_training):
    # decoder_input: [batch_size * beam_size, step]; step grows by one each call, i.e. 1, 2, 3, ...
    # decoder_cache: [batch_size * beam_size, 0, num_blocks, hidden_units]
    # encoder_output: [batch_size * beam_size, time_step, hidden_units]
    attention_dropout_rate = self._config.attention_dropout_rate if is_training else 0.0
    residual_dropout_rate = self._config.residual_dropout_rate if is_training else 0.0

    encoder_padding = tf.equal(tf.reduce_sum(tf.abs(encoder_output), axis=-1), 0.0)
    encoder_attention_bias = common_attention.attention_bias_ignore_padding(encoder_padding)

    decoder_output = embedding(decoder_input,
                               vocab_size=self._config.dst_vocab_size,
                               dense_size=self._config.hidden_units,
                               multiplier=self._config.hidden_units**0.5
                               if self._config.scale_embedding else 1.0,
                               name="dst_embedding")
    # Positional Encoding
    decoder_output += common_attention.add_timing_signal_1d(decoder_output)
    # Dropout
    decoder_output = tf.layers.dropout(decoder_output,
                                       rate=residual_dropout_rate,
                                       training=is_training)

    new_cache = []

    # Blocks
    for i in range(self._config.num_blocks):
        with tf.variable_scope("block_{}".format(i)):
            # Multihead Attention (self-attention); only the last position is
            # computed, so no future mask is needed (bias=None).
            decoder_output = residual(
                decoder_output[:, -1:, :],
                multihead_attention(
                    query_antecedent=decoder_output,
                    memory_antecedent=None,
                    bias=None,
                    total_key_depth=self._config.hidden_units,
                    total_value_depth=self._config.hidden_units,
                    num_heads=self._config.num_heads,
                    dropout_rate=attention_dropout_rate,
                    reserve_last=True,
                    output_depth=self._config.hidden_units,
                    name="decoder_self_attention",
                    summaries=True),
                dropout_rate=residual_dropout_rate)

            # Multihead Attention (vanilla attention)
            decoder_output = residual(
                decoder_output,
                multihead_attention(
                    query_antecedent=decoder_output,
                    memory_antecedent=encoder_output,
                    bias=encoder_attention_bias,
                    total_key_depth=self._config.hidden_units,
                    total_value_depth=self._config.hidden_units,
                    output_depth=self._config.hidden_units,
                    num_heads=self._config.num_heads,
                    dropout_rate=attention_dropout_rate,
                    reserve_last=True,
                    name="decoder_vanilla_attention",
                    summaries=True),
                dropout_rate=residual_dropout_rate)

            # Feed Forward
            decoder_output = residual(
                decoder_output,
                ff_hidden(decoder_output,
                          hidden_size=4 * self._config.hidden_units,
                          output_size=self._config.hidden_units,
                          activation=self._ff_activation),
                dropout_rate=residual_dropout_rate)

            # Prepend this block's cached history so the next block sees the
            # full sequence, and record the extended cache.
            decoder_output = tf.concat([decoder_cache[:, :, i, :], decoder_output], axis=1)
            new_cache.append(decoder_output[:, :, None, :])

    new_cache = tf.concat(new_cache, axis=2)  # [batch_size, n_step, num_blocks, hidden_units]

    return decoder_output, new_cache
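# Sketch of how the cache is threaded through step-by-step decoding (assumed
# usage; the real beam-search driver is elsewhere, and `output_projection` is a
# hypothetical name for the logits layer). The cache starts with a zero-length
# time axis and grows by one position per call.
def decode_step(self, decoder_input, cache, encoder_output):
    last_output, cache = self.decoder_with_caching_impl(
        decoder_input, cache, encoder_output, is_training=False)
    logits = self.output_projection(last_output[:, -1, :])  # hypothetical projection to vocab
    next_ids = tf.to_int32(tf.argmax(logits, axis=-1))
    return next_ids, cache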
def decoder_impl(self, decoder_input, encoder_output, is_training):
    # decoder_input: [batch_size, step]
    # encoder_output: [batch_size, time_step, hidden_units]
    attention_dropout_rate = self._config.attention_dropout_rate if is_training else 0.0
    residual_dropout_rate = self._config.residual_dropout_rate if is_training else 0.0

    encoder_padding = tf.equal(tf.reduce_sum(tf.abs(encoder_output), axis=-1), 0.0)
    encoder_attention_bias = common_attention.attention_bias_ignore_padding(encoder_padding)

    decoder_output = embedding(decoder_input,
                               vocab_size=self._config.dst_vocab_size,
                               dense_size=self._config.hidden_units,
                               multiplier=self._config.hidden_units**0.5
                               if self._config.scale_embedding else 1.0,
                               name="dst_embedding")
    # Positional Encoding
    decoder_output += common_attention.add_timing_signal_1d(decoder_output)
    # Dropout
    decoder_output = tf.layers.dropout(decoder_output,
                                       rate=residual_dropout_rate,
                                       training=is_training)
    # Bias for preventing peeking at later information
    self_attention_bias = common_attention.attention_bias_lower_triangle(
        tf.shape(decoder_input)[1])

    # Blocks
    for i in range(self._config.num_blocks_dec):
        with tf.variable_scope("block_{}".format(i)):
            # Multihead Attention (self-attention)
            decoder_output = residual(
                decoder_output,
                multihead_attention(
                    query_antecedent=decoder_output,
                    memory_antecedent=None,
                    bias=self_attention_bias,
                    total_key_depth=self._config.hidden_units,
                    total_value_depth=self._config.hidden_units,
                    num_heads=self._config.num_heads,
                    dropout_rate=attention_dropout_rate,
                    output_depth=self._config.hidden_units,
                    name="decoder_self_attention",
                    summaries=True),
                dropout_rate=residual_dropout_rate)

            # Multihead Attention (vanilla attention)
            decoder_output = residual(
                decoder_output,
                multihead_attention(
                    query_antecedent=decoder_output,
                    memory_antecedent=encoder_output,
                    bias=encoder_attention_bias,
                    total_key_depth=self._config.hidden_units,
                    total_value_depth=self._config.hidden_units,
                    output_depth=self._config.hidden_units,
                    num_heads=self._config.num_heads,
                    dropout_rate=attention_dropout_rate,
                    name="decoder_vanilla_attention",
                    summaries=True),
                dropout_rate=residual_dropout_rate)

            # Feed Forward
            decoder_output = residual(
                decoder_output,
                ff_hidden(decoder_output,
                          hidden_size=4 * self._config.hidden_units,
                          output_size=self._config.hidden_units,
                          activation=self._ff_activation),
                dropout_rate=residual_dropout_rate)

    return decoder_output
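# For reference, `attention_bias_lower_triangle(3)` conceptually yields
#   [[   0, -1e9, -1e9],
#    [   0,    0, -1e9],
#    [   0,    0,    0]]
# broadcast to shape [1, 1, 3, 3]; added to the attention logits before the
# softmax, the large negative entries zero out attention to future positions.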
def tck2emb(self, streamlines: typing.List) -> np.ndarray:
    # Map a list of streamlines to embeddings via the wrapped network.
    return embedding(self.net, streamlines)
def build_network(self):
    config = self.config
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    # Encoder
    with tf.variable_scope("encoder"):
        ## Embedding
        self.enc = embedding(self.x,
                             len(de2idx),
                             num_units=config.hidden_dim,
                             scale=True,
                             scope='enc_embed')
        ## plus position embedding
        self.enc += embedding(
            tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                    [tf.shape(self.x)[0], 1]),
            config.maxlen,
            config.hidden_dim,
            zero_pad=False,
            scale=False,
            scope="enc_pe")
        self.enc = dropout(self.enc, config.keep_rate, is_train=self.is_train)
        self.enc_ = self.enc

        for block_idx in range(config.num_enc_block_1):
            scope = "encoder_block_{}".format(block_idx)
            enc_out = conv2d(self.enc,
                             kernel_shape=(config.enc_kernel_width, 1),
                             scope=scope)
            enc_out = batch_norm(enc_out, is_training=self.is_train, scope="lm" + scope)
            self.enc = enc_out

    # Decoder
    with tf.variable_scope("decoder"):
        ## Embedding
        self.dec = embedding(self.decode_input,
                             len(en2idx),
                             config.hidden_dim,
                             scale=True,
                             scope='dec_embed')
        ## plus position embedding
        self.dec += embedding(
            tf.tile(tf.expand_dims(tf.range(tf.shape(self.decode_input)[1]), 0),
                    [tf.shape(self.decode_input)[0], 1]),
            config.maxlen,
            config.hidden_dim,
            zero_pad=False,
            scale=False,
            scope='dec_pe')
        self.dec_ = self.dec

        for block_idx in range(config.num_dec_block_1):
            scope = "decoder_block_conv_{}".format(block_idx)
            attention_scope = "decoder_block_att_{}".format(block_idx)
            dec_out = conv2d(self.dec,
                             kernel_shape=(config.dec_kernel_width, 1),
                             causal=True,
                             scope=scope)
            dec_out = attention_pool(self.enc_, self.dec, enc_out, dec_out,
                                     scope=attention_scope)
            dec_out = dec_out + self.dec
            dec_out = batch_norm(dec_out, is_training=self.is_train, scope="lm" + scope)
            self.dec = dec_out

    with tf.variable_scope('encoder'):
        for block_idx in range(config.num_enc_block_2):
            scope = "encoder_block_{}".format(config.num_enc_block_1 + block_idx)
            enc_out = conv2d(self.enc,
                             kernel_shape=(config.enc_kernel_width, 1),
                             num_outputs=config.hidden_dim_2,
                             scope=scope)
            enc_out = batch_norm(enc_out, is_training=self.is_train, scope="lm" + scope)
            self.enc = enc_out

    with tf.variable_scope('decoder'):
        for block_idx in range(config.num_dec_block_2):
            scope = "decoder_block_conv_{}".format(config.num_dec_block_1 + block_idx)
            attention_scope = "decoder_block_att_{}".format(config.num_dec_block_1 + block_idx)
            dec_out = conv2d(self.dec,
                             kernel_shape=(config.dec_kernel_width, 1),
                             num_outputs=config.hidden_dim_2,
                             causal=True,
                             scope=scope)
            dec_out = attention_pool(self.enc_, self.dec, enc_out, dec_out,
                                     scope=attention_scope)
            dec_out = dec_out + self.dec
            dec_out = batch_norm(dec_out, is_training=self.is_train, scope="lm" + scope)
            self.dec = dec_out

    with tf.variable_scope("softmax_layer"):
        w = tf.get_variable('w', [config.hidden_dim, len(en2idx)])
        b = tf.get_variable('b', [len(en2idx)])
        w = tf.tile(tf.expand_dims(w, 0), [config.batch_size, 1, 1])
        self.logits = tf.matmul(dec_out, w) + b

    self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
    self.istarget = tf.to_float(tf.not_equal(self.y, 0))
    self.acc = tf.reduce_sum(
        tf.to_float(tf.equal(self.preds, self.y)) * self.istarget) / tf.reduce_sum(self.istarget)
    tf.summary.scalar('acc', self.acc)

    if self.is_train:
        self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(en2idx)))
        self.loss = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits, labels=self.y_smoothed)
        self.mean_loss = tf.reduce_mean(self.loss)
        tf.summary.scalar('mean_loss', self.mean_loss)

    self.tensors = {
        'source_sentence': self.enc_,
        'target_sentence': self.dec_,
        'enc_out': enc_out,
        'dec_out': dec_out,
        'predictions': self.preds,
        'logits': self.logits,
    }
    if self.is_train:
        self.tensors['loss'] = self.loss
    for key, value in self.tensors.items():
        tf.summary.histogram(key, value)
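# `label_smoothing` above follows the usual trick from the Transformer paper. A
# minimal sketch, assuming epsilon=0.1; the project's helper may use a
# different epsilon.
def label_smoothing(inputs, epsilon=0.1):
    V = inputs.get_shape().as_list()[-1]  # number of classes (vocabulary size)
    return ((1.0 - epsilon) * inputs) + (epsilon / V)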