def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], dtype='float32', name='x', append_batch_size=False) x.stop_gradient = False h_boot1 = layers.data(shape=[self.batch_size, self.input_dim], dtype='float32', name='h_boot1', append_batch_size=False) h_boot1.stop_gradient = False h_boot2 = layers.data(shape=[self.batch_size, self.input_dim], dtype='float32', name='h_boot2', append_batch_size=False) h_boot2.stop_gradient = False rnn = layers.StaticRNN() with rnn.step(): h_pre1 = rnn.memory(init=h_boot1) h_pre2 = rnn.memory(init=h_boot2) x_t = rnn.step_input(x) mem1 = layers.scale(x=h_pre1, scale=1.0) mem2 = layers.scale(x=h_pre2, scale=1.0) out = layers.sums(input=[mem1, x_t, mem2]) rnn.update_memory(h_pre1, mem1) rnn.update_memory(h_pre2, mem2) rnn.output(out) return rnn()
def forward(self, query, encoder_out, mask=None, last_attended=None):
    """
    Compute contextualized representation and alignment scores.

    Args:
        query (Variable): shape(B, T_dec, C_q), dtype float32, the query
            tensor, where C_q means the query dim.
        encoder_out (keys, values):
            keys (Variable): shape(B, T_enc, C_emb), dtype float32, the key
                representation from an encoder, where C_emb means embed dim.
            values (Variable): shape(B, T_enc, C_emb), dtype float32, the value
                representation from an encoder, where C_emb means embed dim.
        mask (Variable, optional): shape(B, T_enc), dtype float32, mask
            generated with valid text lengths. Pad tokens correspond to 1,
            and valid tokens correspond to 0.
        last_attended (int, optional): The position that received the most
            attention at the last time step. This is only used at inference.

    Outputs:
        x (Variable): shape(B, T_dec, C_q), dtype float32, the contextualized
            representation from the attention mechanism.
        attn_scores (Variable): shape(B, T_dec, T_enc), dtype float32, the
            alignment tensor, where T_dec means the number of decoder time
            steps and T_enc means the number of encoder time steps.
    """
    keys, values = encoder_out
    residual = query
    if self.value_projection:
        values = self.value_proj(values)
    if self.key_projection:
        keys = self.key_proj(keys)
    x = self.query_proj(query)

    x = F.matmul(x, keys, transpose_y=True)

    # mask generated by sentence length
    neg_inf = -1.e30
    if mask is not None:
        neg_inf_mask = F.scale(F.unsqueeze(mask, [1]), neg_inf)
        x += neg_inf_mask

    # if last_attended is provided, focus only on a window range around it
    # to enforce monotonic attention.
    if last_attended is not None:
        locality_mask = np.ones(shape=x.shape, dtype=np.float32)
        backward, ahead = self.window_range
        backward = last_attended + backward
        ahead = last_attended + ahead
        backward = max(backward, 0)
        ahead = min(ahead, x.shape[-1])
        locality_mask[:, :, backward:ahead] = 0.
        locality_mask = dg.to_variable(locality_mask)
        neg_inf_mask = F.scale(locality_mask, neg_inf)
        x += neg_inf_mask

    x = F.softmax(x)
    attn_scores = x
    x = F.dropout(
        x, self.dropout, dropout_implementation="upscale_in_train")
    x = F.matmul(x, values)
    encoder_length = keys.shape[1]

    x = F.scale(x, encoder_length * np.sqrt(1.0 / encoder_length))
    x = self.out_proj(x)
    x = F.scale((x + residual), np.sqrt(0.5))
    return x, attn_scores
def forward(self, q, k, v, lengths, speaker_embed, start_index, force_monotonic=False, prev_coeffs=None, window=None): # add position encoding as an inductive bias if self.has_bias: # multi-speaker model omega_q = 2 * F.sigmoid( F.squeeze(self.q_pos_affine(speaker_embed), axes=[-1])) omega_k = 2 * self.omega_initial * F.sigmoid(F.squeeze( self.k_pos_affine(speaker_embed), axes=[-1])) else: # single-speaker case batch_size = q.shape[0] omega_q = F.ones((batch_size, ), dtype="float32") omega_k = F.ones((batch_size, ), dtype="float32") * self.omega_default q += self.position_encoding_weight * positional_encoding(q, start_index, omega_q) k += self.position_encoding_weight * positional_encoding(k, 0, omega_k) q, k, v = self.q_affine(q), self.k_affine(k), self.v_affine(v) activations = F.matmul(q, k, transpose_y=True) activations /= np.sqrt(self.attention_dim) if self.training: # mask the <pad> parts from the encoder mask = F.sequence_mask(lengths, dtype="float32") attn_bias = F.scale(1. - mask, -1000) activations += F.unsqueeze(attn_bias, [1]) elif force_monotonic: assert window is not None backward_step, forward_step = window T_enc = k.shape[1] batch_size, T_dec, _ = q.shape # actually T_dec = 1 here alpha = F.fill_constant((batch_size, T_dec), value=0, dtype="int64") \ if prev_coeffs is None \ else F.argmax(prev_coeffs, axis=-1) backward = F.sequence_mask(alpha - backward_step, maxlen=T_enc, dtype="bool") forward = F.sequence_mask(alpha + forward_step, maxlen=T_enc, dtype="bool") mask = F.cast(F.logical_xor(backward, forward), "float32") # print("mask's shape:", mask.shape) attn_bias = F.scale(1. - mask, -1000) activations += attn_bias # softmax coefficients = F.softmax(activations, axis=-1) # context vector coefficients = F.dropout(coefficients, 1. - self.keep_prob, dropout_implementation='upscale_in_train') contexts = F.matmul(coefficients, v) # context normalization enc_lengths = F.cast(F.unsqueeze(lengths, axes=[1, 2]), "float32") contexts *= F.sqrt(enc_lengths) # out affine contexts = self.out_affine(contexts) return contexts, coefficients
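When `force_monotonic` is set, the two `sequence_mask` calls above carve out a window of width `backward_step + forward_step` around the previous attention peak, and positions outside the window receive a large negative bias before the softmax. A minimal NumPy sketch of that mask construction (hypothetical helper, independent of the Paddle graph):

```python
import numpy as np

def window_mask(alpha, backward_step, forward_step, T_enc):
    # alpha: (batch, T_dec) integer positions of the previous attention peak
    pos = np.arange(T_enc)
    backward = pos[None, None, :] < (alpha - backward_step)[:, :, None]
    forward = pos[None, None, :] < (alpha + forward_step)[:, :, None]
    # XOR keeps exactly the window [alpha - backward_step, alpha + forward_step)
    return np.logical_xor(backward, forward).astype("float32")

mask = window_mask(np.array([[3]]), backward_step=0, forward_step=4, T_enc=8)
print(mask)  # 1.0 at positions 3..6, 0.0 elsewhere
```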
def setUp(self): self.setup_program() self.data_field = {"x", "h_boot"} self.input_shape = (self.sent_len, self.batch_size, self.input_dim) self.output_shape = (self.sent_len, self.batch_size, self.input_dim) self.py_rnn = PySimpleRNN1(self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): x = layers.data( shape=[self.sent_len, self.batch_size, self.input_dim], dtype='float32', name='x', append_batch_size=False) x.stop_gradient = False h_boot = layers.data( shape=[self.input_dim], dtype='float32', name='h_boot') h_boot.stop_gradient = False forward_only_rnn = layers.StaticRNN() with forward_only_rnn.step(): h_pre = forward_only_rnn.memory(init=h_boot) x_t = forward_only_rnn.step_input(x) h = layers.scale( x=layers.elementwise_add( x=h_pre, y=x_t), scale=self.py_rnn.scale) forward_only_rnn.update_memory(h_pre, h) forward_only_rnn.output(h) forward_only_output = forward_only_rnn() forward_only_output.stop_gradient = True self.forward_only_output = layers.mean(forward_only_output) rnn = layers.StaticRNN() with rnn.step(): h_pre = rnn.memory(init=h_boot) x_t = rnn.step_input(x) h = layers.scale( x=layers.elementwise_add( x=h_pre, y=x_t), scale=self.py_rnn.scale) rnn.update_memory(h_pre, h) rnn.output(h) self.output = layers.mean(rnn())
def forward(self, x, speaker_embed=None):
    """
    Args:
        x (Variable): shape(B, C_in, T), dtype float32, the input of the
            Conv1DGLU layer, where B means batch_size, C_in means the input
            channels, and T means the input time steps.
        speaker_embed (Variable): shape(B, C_sp), dtype float32, speaker
            embed, where C_sp means speaker embedding size.

    Returns:
        x (Variable): shape(B, C_out, T), the output of Conv1DGLU, where
            C_out means the `num_filters`.
    """
    residual = x
    x = F.dropout(
        x, self.dropout, dropout_implementation="upscale_in_train")
    x = self.conv(x)
    content, gate = F.split(x, num_or_sections=2, dim=1)

    if speaker_embed is not None:
        sp = F.softsign(self.fc(speaker_embed))
        content = F.elementwise_add(content, sp, axis=0)

    # glu
    x = F.sigmoid(gate) * content

    if self.residual:
        x = F.scale(x + residual, np.sqrt(0.5))
    return x
def add_input(self, x_t, speaker_embed=None):
    """
    Takes a step of inputs and returns a step of outputs. It works similarly
    to the `forward` method, but in a `step-in-step-out` fashion.

    Args:
        x_t (Variable): shape(B, C_in, T=1), dtype float32, the input of the
            Conv1DGLU layer, where B means batch_size, C_in means the input
            channels.
        speaker_embed (Variable): shape(B, C_sp), dtype float32, speaker
            embed, where C_sp means speaker embedding size.

    Returns:
        x (Variable): shape(B, C_out), the output of Conv1DGLU, where C_out
            means the `num_filters`.
    """
    residual = x_t
    x_t = F.dropout(
        x_t, self.dropout, dropout_implementation="upscale_in_train")
    x_t = self.conv.add_input(x_t)
    content_t, gate_t = F.split(x_t, num_or_sections=2, dim=1)

    if speaker_embed is not None:
        sp = F.softsign(self.fc(speaker_embed))
        content_t = F.elementwise_add(content_t, sp, axis=0)

    # glu
    x_t = F.sigmoid(gate_t) * content_t

    if self.residual:
        x_t = F.scale(x_t + residual, np.sqrt(0.5))
    return x_t
def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
    """
    Scaled Dot-Product Attention
    """

    # FIXME(guosheng): Optimize the shape in reshape_op or softmax_op.
    # The current implementation of softmax_op only supports 2D tensors,
    # so it cannot be used directly here. The reshape_op is not usable
    # either, because the shape of `product` inferred at compile time is
    # not the actual run-time shape and thus cannot be used to set the
    # reshape_op attribute. So a softmax is defined here as a temporary
    # solution.
    def __softmax(x, eps=1e-9):
        exp_out = layers.exp(x=x)
        sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
        return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)

    scaled_q = layers.scale(x=q, scale=d_model**-0.5)
    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
    weights = __softmax(layers.elementwise_add(x=product, y=attn_bias))
    if dropout_rate:
        weights = layers.dropout(
            weights, dropout_prob=dropout_rate, is_test=False)
    out = layers.matmul(weights, v)
    return out
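For comparison, the same computation expressed in plain NumPy (a minimal sketch, not tied to the Paddle graph; the max subtraction is added for numerical stability and is not part of the snippet above):

```python
import numpy as np

def scaled_dot_product_attention_np(q, k, v, attn_bias, d_model):
    # q, k, v: (..., seq_len, d); attn_bias broadcastable to the score shape
    scores = np.matmul(q * d_model ** -0.5, np.swapaxes(k, -1, -2)) + attn_bias
    scores -= scores.max(axis=-1, keepdims=True)    # numerical stability
    weights = np.exp(scores)
    weights /= weights.sum(axis=-1, keepdims=True)  # softmax over the keys
    return np.matmul(weights, v)
```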
def create_rnn_op(self): x = layers.data( shape=[self.sent_len, self.batch_size, self.input_dim], dtype='float32', name='x', append_batch_size=False, **self.p_info) x.stop_gradient = False h_boot = layers.data( shape=[self.input_dim], dtype='float32', name='h_boot', **self.p_info) h_boot.stop_gradient = False rnn = layers.StaticRNN(main_program=self.main_program) with rnn.step(): h_pre = rnn.memory(init=h_boot) x_t = rnn.step_input(x) h = layers.scale( x=layers.elementwise_add( x=h_pre, y=x_t, **self.p_info), scale=self.py_rnn.scale, **self.p_info) rnn.update_memory(h_pre, h) rnn.output(h) return rnn()
def prepare_encoder(
        src_word,  # [b, t, c]
        src_pos,
        src_vocab_size,
        src_emb_dim,
        src_max_len,
        dropout_rate=0.,
        bos_idx=0,
        word_emb_param_name=None,
        pos_enc_param_name=None):
    """Add word embeddings and position encodings.
    The output tensor has a shape of:
    [batch_size, max_src_length_in_batch, d_model].
    This module is used at the bottom of the encoder stacks.
    """
    src_word_emb = layers.cast(src_word, 'float32')
    src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5)
    src_pos_enc = layers.embedding(src_pos,
                                   size=[src_max_len, src_emb_dim],
                                   param_attr=fluid.ParamAttr(
                                       name=pos_enc_param_name,
                                       trainable=False))
    src_pos_enc.stop_gradient = True
    enc_input = src_word_emb + src_pos_enc
    return layers.dropout(
        enc_input, dropout_prob=dropout_rate, seed=dropout_seed,
        is_test=False) if dropout_rate else enc_input
def prepare_encoder(src_word, src_pos, src_vocab_size, src_emb_dim, src_max_len, dropout_rate=0., src_data_shape=None, word_emb_param_name=None, pos_enc_param_name=None): """Add word embeddings and position encodings. The output tensor has a shape of: [batch_size, max_src_length_in_batch, d_model]. This module is used at the bottom of the encoder stacks. """ src_word_emb = layers.embedding(src_word, size=[src_vocab_size, src_emb_dim], param_attr=fluid.ParamAttr( name=word_emb_param_name, initializer=fluid.initializer.Normal( 0., src_emb_dim**-0.5))) src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5) src_pos_enc = layers.embedding(src_pos, size=[src_max_len, src_emb_dim], param_attr=fluid.ParamAttr( name=pos_enc_param_name, trainable=False)) enc_input = src_word_emb + src_pos_enc enc_input = layers.reshape(x=enc_input, shape=[batch_size, seq_len, src_emb_dim], actual_shape=src_data_shape) return layers.dropout(enc_input, dropout_prob=dropout_rate, is_test=False) if dropout_rate else enc_input
def __call__(self, msg): alpha = msg["alpha"] # lod-tensor (batch_size, num_heads) if attn_drop: old_h = alpha dropout = F.data(name='attn_drop', shape=[1], dtype="int64") u = L.uniform_random(shape=L.cast(L.shape(alpha)[:1], 'int64'), min=0., max=1.) keeped = L.cast(u > dropout, dtype="float32") self_attn_mask = L.scale(x=keeped, scale=10000.0, bias=-1.0, bias_after_scale=False) n_head_self_attn_mask = L.stack(x=[self_attn_mask] * num_heads, axis=1) n_head_self_attn_mask.stop_gradient = True alpha = n_head_self_attn_mask + alpha alpha = L.lod_reset(alpha, old_h) h = msg["v"] alpha = paddle_helper.sequence_softmax(alpha) self.alpha = alpha old_h = h h = h * alpha h = L.lod_reset(h, old_h) h = L.sequence_pool(h, "sum") if concat: h = L.reshape(h, [-1, num_heads * hidden_size]) else: h = L.reduce_mean(h, dim=1) return h
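The `L.scale(x=keeped, scale=10000.0, bias=-1.0, bias_after_scale=False)` call above computes `(keeped - 1) * 10000`, turning the 0/1 keep mask into a 0/-10000 additive bias so that dropped entries vanish after the softmax. A tiny NumPy check of that arithmetic:

```python
import numpy as np

keeped = np.array([1., 0., 1.], dtype="float32")  # 1 = keep, 0 = drop
attn_bias = (keeped - 1.) * 10000.                # bias applied before scaling
print(attn_bias)                                  # [     0. -10000.      0.]
```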
def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
    """
    Scaled Dot-Product Attention.

    attn_bias is a (max_len, max_len) matrix whose valid L x L block is 0
    and whose remaining entries are -inf.
    """
    scaled_q = layers.scale(x=q, scale=d_key**-0.5)
    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
    if attn_bias:
        product += attn_bias
    weights = layers.softmax(product)

    # zero out the weights at biased (padded/masked) positions,
    # i.e. wherever attn_bias != 0
    attn_mask = attn_bias == 0
    attn_mask = layers.cast(attn_mask, 'float64')
    weights = layers.elementwise_mul(attn_mask, weights)

    if dropout_rate:
        weights = layers.dropout(weights,
                                 dropout_prob=dropout_rate,
                                 dropout_implementation="upscale_in_train",
                                 is_test=False)
    out = layers.matmul(weights, v)
    return out
def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
    """
    Scaled Dot-Product Attention.

    attn_bias is a (max_len, max_len) matrix whose valid L x L block is 0
    and whose remaining entries are -inf.
    """
    scaled_q = layers.scale(x=q, scale=d_key**-0.5)
    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
    if attn_bias:
        product += attn_bias
    weights = layers.softmax(product)
    if dropout_rate:
        weights = layers.dropout(weights,
                                 dropout_prob=dropout_rate,
                                 dropout_implementation="upscale_in_train",
                                 is_test=False)
    out = layers.matmul(weights, v)
    return out
def add_input(self, x, condition=None):
    """Add a step input. This method works similarly to `forward` but in a
    `step-in-step-out` fashion.

    Args:
        x (Variable): shape(B, C_res, T=1), input for a step, dtype float32.
        condition (Variable, optional): shape(B, C_cond, T=1), condition for
            a step, dtype float32. Defaults to None.

    Returns:
        (residual, skip_connection)
        residual (Variable): shape(B, C_res, T=1), the residual for a step,
            which is used as the input to the next layer of ResidualBlock.
        skip_connection (Variable): shape(B, C_res, T=1), the skip connection
            for a step. This output is accumulated with that of other
            ResidualBlocks.
    """
    h = x

    # dilated conv
    h = self.conv.add_input(h)

    # condition
    if condition is not None:
        h += self.condition_proj(condition)

    # gated tanh
    content, gate = F.split(h, 2, dim=1)
    z = F.sigmoid(gate) * F.tanh(content)

    # projection
    residual = F.scale(z + x, np.sqrt(0.5))
    skip_connection = z
    return residual, skip_connection
def forward(self, x, condition=None): """Conv1D gated-tanh Block. Args: x (Variable): shape(B, C_res, T), the input. (B stands for batch_size, C_res stands for residual channels, T stands for time steps.) dtype float32. condition (Variable, optional): shape(B, C_cond, T), the condition, it has been upsampled in time steps, so it has the same time steps as the input does.(C_cond stands for the condition's channels). Defaults to None. Returns: (residual, skip_connection) residual (Variable): shape(B, C_res, T), the residual, which is used as the input to the next layer of ResidualBlock. skip_connection (Variable): shape(B, C_res, T), the skip connection. This output is accumulated with that of other ResidualBlocks. """ time_steps = x.shape[-1] h = x # dilated conv h = self.conv(h) if h.shape[-1] != time_steps: h = h[:, :, :time_steps] # condition if condition is not None: h += self.condition_proj(condition) # gated tanh content, gate = F.split(h, 2, dim=1) z = F.sigmoid(gate) * F.tanh(content) # projection residual = F.scale(z + x, math.sqrt(.5)) skip_connection = z return residual, skip_connection
def prepare_decoder(src_word,
                    src_pos,
                    src_vocab_size,
                    src_emb_dim,
                    src_max_len,
                    dropout_rate=0.,
                    bos_idx=0,
                    word_emb_param_name=None,
                    pos_enc_param_name=None):
    """Add word embeddings and position encodings.
    The output tensor has a shape of:
    [batch_size, max_src_length_in_batch, d_model].
    This module is used at the bottom of the decoder stacks.
    """
    src_word_emb = layers.embedding(
        src_word,
        size=[src_vocab_size, src_emb_dim],
        padding_idx=bos_idx,  # set embedding of bos to 0
        param_attr=fluid.ParamAttr(name=word_emb_param_name,
                                   initializer=fluid.initializer.Normal(
                                       0., src_emb_dim**-0.5)))
    src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5)
    src_pos_enc = layers.embedding(src_pos,
                                   size=[src_max_len, src_emb_dim],
                                   param_attr=fluid.ParamAttr(
                                       name=pos_enc_param_name,
                                       trainable=False))
    src_pos_enc.stop_gradient = True
    enc_input = src_word_emb + src_pos_enc
    return layers.dropout(
        enc_input, dropout_prob=dropout_rate, seed=dropout_seed,
        is_test=False) if dropout_rate else enc_input
def forward_attention(self, indicator, support_x, support_y_embed,
                      support_mask, query_x, query_y, query_mask):
    """
    support_indicator: length = support_len
        indicator = 0 for attention(support, query)
        indicator = 1 for attention(support, support)
    """
    support_y_embed = support_y_embed * support_mask
    support_xy = layers.concat([support_x, support_y_embed, indicator],
                               axis=1)
    pad_value = layers.assign(
        input=numpy.array([0.0], dtype=numpy.float32))
    support_pad, support_len = layers.sequence_pad(support_xy,
                                                   pad_value=pad_value)
    query_pad, query_len = layers.sequence_pad(query_x,
                                               pad_value=pad_value)
    attention = self.attention(query_pad, support_pad, support_pad,
                               self.hidden_dim, 'meta')
    attention = layers.sequence_unpad(attention, length=query_len)
    pred_input = layers.concat([attention, query_x], axis=1)
    pred = self.prepare_preds_with_name(pred_input, 'out_pred')
    label = layers.cast(query_y, dtype='float32')
    label = layers.scale(label, scale=0.01)
    loss = layers.huber_loss(pred, label, 1.0) * query_mask
    loss = layers.mean(loss)
    return pred, label, loss
def forward(self, queries, keys, values, attn_bias, past_cache): assert len(queries.shape) == len(keys.shape) == len(values.shape) == 3 #bsz, q_len, q_dim = queries.shape #bsz, k_len, k_dim = keys.shape #bsz, v_len, v_dim = values.shape #assert k_len == v_len q = self.q(queries) k = self.k(keys) v = self.v(values) cache = (k, v) if past_cache is not None: cached_k, cached_v = past_cache k = L.concat([cached_k, k], 1) v = L.concat([cached_v, v], 1) q = L.transpose(L.reshape(q, [0, 0, self.n_head, q.shape[-1] // self.n_head]), [0, 2, 1, 3]) #[batch, head, seq, dim] k = L.transpose(L.reshape(k, [0, 0, self.n_head, k.shape[-1] // self.n_head]), [0, 2, 1, 3]) #[batch, head, seq, dim] v = L.transpose(L.reshape(v, [0, 0, self.n_head, v.shape[-1] // self.n_head]), [0, 2, 1, 3]) #[batch, head, seq, dim] q = L.scale(q, scale=self.d_key ** -0.5) score = L.matmul(q, k, transpose_y=True) if attn_bias is not None: score += attn_bias score = L.softmax(score, use_cudnn=True) score = self.dropout(score) out = L.matmul(score, v) out = L.transpose(out, [0, 2, 1, 3]) out = L.reshape(out, [0, 0, out.shape[2] * out.shape[3]]) out = self.o(out) return out, cache
def attention(self, query_feature, key_feature, value_feature, hidden_dim,
              name):
    """
    attention
    """
    query_fc = layers.fc(input=query_feature,
                         size=hidden_dim,
                         param_attr=fluid.ParamAttr(
                             name='query_fc_%s' % name,
                             learning_rate=self.fc_lr),
                         act='relu',
                         num_flatten_dims=2)
    key_fc = layers.fc(input=key_feature,
                       size=hidden_dim,
                       param_attr=fluid.ParamAttr(
                           'key_fc_%s' % name,
                           learning_rate=self.fc_lr),
                       act='relu',
                       num_flatten_dims=2)
    value_fc = layers.fc(input=value_feature,
                         size=hidden_dim,
                         param_attr=fluid.ParamAttr(
                             'value_fc_%s' % name,
                             learning_rate=self.fc_lr),
                         act='relu',
                         num_flatten_dims=2)
    query_key_mat = layers.matmul(query_fc, key_fc, False, True)
    query_key_mat = layers.scale(query_key_mat,
                                 scale=1.0 / math.sqrt(hidden_dim))
    matching_score = layers.softmax(query_key_mat, axis=2)
    attention = layers.matmul(matching_score, value_fc)
    return attention
def forward(self, input, bias=None, padding=None):
    """
    input: input feature (B, T, C)
    padding: only used with causal conv, where we pad manually
    """
    input_dropped = F.dropout(input,
                              1. - self.keep_prob,
                              dropout_implementation="upscale_in_train")
    if self.causal:
        assert padding is not None
        input_dropped = F.concat([padding, input_dropped], axis=1)
    hidden = self.conv(input_dropped)

    if self.has_bias:
        assert bias is not None
        transformed_bias = F.softsign(self.bias_affine(bias))
        hidden_embedded = hidden + F.unsqueeze(transformed_bias, [1])
    else:
        hidden_embedded = hidden

    # glu
    content, gate = F.split(hidden, num_or_sections=2, dim=-1)
    content = hidden_embedded[:, :, :self.in_channel]
    hidden = F.sigmoid(gate) * content

    # residual
    hidden = F.scale(input + hidden, math.sqrt(0.5))
    return hidden
def forward(self, char_embed, speaker_embed=None): hidden = self.pre_affine(char_embed, speaker_embed) for layer in self.convs: hidden = layer(hidden, speaker_embed) hidden = self.post_affine(hidden, speaker_embed) keys = hidden values = F.scale(char_embed + hidden, np.sqrt(0.5)) return keys, values
def inference_program(): usr_combined_features = get_usr_combined_features() mov_combined_features = get_mov_combined_features() inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features) scale_infer = layers.scale(x=inference, scale=5.0) return scale_infer
def _gen_input(self, token_ids, type_ids, pos_ids, input_mask, aux_emb=None): token_emb_out = layers.embedding( input=token_ids, size=[self.vocab_size, self.emb_size], dtype=self.dtype, param_attr=fluid.ParamAttr(name=self.token_emb_name, initializer=self.param_initializer)) type_emb_out = layers.embedding( input=type_ids, size=[self.type_size, self.emb_size], dtype=self.dtype, param_attr=fluid.ParamAttr(name=self.type_emb_name, initializer=self.param_initializer)) pos_emb_out = layers.embedding( input=pos_ids, size=[self.max_position_seq_len, self.emb_size], dtype=self.dtype, param_attr=fluid.ParamAttr(name=self.pos_emb_name, initializer=self.param_initializer)) emb_out = token_emb_out + type_emb_out + pos_emb_out # auxiliary memory embeddings if aux_emb is not None: emb_out = layers.concat([aux_emb, emb_out], axis=1) # post process of embedding emb_out = pre_process_layer(emb_out, self.pre_encoder_cmd, self.prepostprocess_dropout, name="pre_encoder", epsilon=self.epsilon) if self.emb_mapping_in: emb_out = layers.fc(input=emb_out, num_flatten_dims=2, size=self.hidden_size, param_attr=fluid.ParamAttr( name="emb_hidden_mapping", initializer=self.param_initializer), bias_attr="emb_hidden_mapping_bias") # generate n-head self-attention mask self_attn_mask = input_mask self_attn_mask = layers.scale(x=self_attn_mask, scale=1e4, bias=-1.0, bias_after_scale=False) n_head_self_attn_mask = layers.stack(x=[self_attn_mask] * self.n_head, axis=1) n_head_self_attn_mask.stop_gradient = True return emb_out, n_head_self_attn_mask
def _gen_dec_input(self, trg_word, trg_pos, trg_slf_attn_bias,
                   trg_src_words_attn_bias, trg_src_sents_attn_bias,
                   graph_attn_bias):
    emb_out = fluid.layers.embedding(
        input=trg_word,
        size=[self.voc_size, self._emb_size],
        padding_idx=self._padding_idx,  # set embedding of pad to 0
        dtype=self._emb_dtype,
        param_attr=fluid.ParamAttr(name=self._word_emb_name,
                                   initializer=self._param_initializer),
        is_sparse=False)
    emb_out = layers.scale(x=emb_out, scale=self._emb_size**0.5)

    position_emb_out = fluid.layers.embedding(
        input=trg_pos,
        size=[self._max_position_seq_len, self._emb_size],
        dtype=self._emb_dtype,
        param_attr=fluid.ParamAttr(name=self._dec_word_pos_emb_name,
                                   trainable=False))
    position_emb_out.stop_gradient = True
    emb_out = emb_out + position_emb_out

    emb_out = layers.dropout(
        emb_out,
        dropout_prob=self._prepostprocess_dropout,
        dropout_implementation="upscale_in_train",
        is_test=False) if self._prepostprocess_dropout else emb_out

    if self._dtype == "float16":
        emb_out = fluid.layers.cast(x=emb_out, dtype=self._dtype)
        if trg_slf_attn_bias is not None:
            trg_slf_attn_bias = fluid.layers.cast(x=trg_slf_attn_bias,
                                                  dtype=self._dtype)
        if trg_src_words_attn_bias is not None:
            trg_src_words_attn_bias = fluid.layers.cast(
                x=trg_src_words_attn_bias, dtype=self._dtype)
        if trg_src_sents_attn_bias is not None:
            trg_src_sents_attn_bias = fluid.layers.cast(
                x=trg_src_sents_attn_bias, dtype=self._dtype)
        if graph_attn_bias is not None:
            graph_attn_bias = fluid.layers.cast(x=graph_attn_bias,
                                                dtype=self._dtype)

    res = namedtuple('results', [
        'emb_out', 'trg_slf_attn_bias', 'trg_src_words_attn_bias',
        'trg_src_sents_attn_bias', 'graph_attn_bias'
    ])
    return res(emb_out=emb_out,
               trg_slf_attn_bias=trg_slf_attn_bias,
               trg_src_words_attn_bias=trg_src_words_attn_bias,
               trg_src_sents_attn_bias=trg_src_sents_attn_bias,
               graph_attn_bias=graph_attn_bias)
def forward(self, src_word, src_pos, src_slf_attn_bias): word_emb = self.word_embedder(src_word) word_emb = layers.scale(x=word_emb, scale=self.emb_dim**0.5) pos_enc = self.pos_encoder(src_pos) pos_enc.stop_gradient = True emb = word_emb + pos_enc enc_input = layers.dropout(emb, dropout_prob=self.emb_dropout, is_test=False) if self.emb_dropout else emb enc_output = self.encoder(enc_input, src_slf_attn_bias) return enc_output
def spec_loss(self, decoded, input, num_frames=None): if num_frames is None: l1_loss = F.reduce_mean(F.abs(decoded - input)) else: # mask the <pad> part of the decoder num_channels = decoded.shape[-1] l1_loss = F.abs(decoded - input) mask = F.sequence_mask(num_frames, dtype="float32") l1_loss *= F.unsqueeze(mask, axes=[-1]) l1_loss = F.reduce_sum(l1_loss) / F.scale(F.reduce_sum(mask), num_channels) return l1_loss
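The masked branch averages the absolute error only over valid frames: the summed `|decoded - input|` inside the mask is divided by `sum(mask) * num_channels`. A short NumPy sketch of the same reduction (hypothetical helper, for illustration only):

```python
import numpy as np

def masked_l1(decoded, target, num_frames):
    # decoded, target: (B, T, C); num_frames: (B,) valid decoder lengths
    B, T, C = decoded.shape
    mask = (np.arange(T)[None, :] < num_frames[:, None]).astype("float32")
    l1 = np.abs(decoded - target) * mask[:, :, None]
    return l1.sum() / (mask.sum() * C)  # average over valid frames and channels
```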
def init_serv(self, place):
    main = fluid.Program()

    with fluid.program_guard(main):
        serv = layers.ListenAndServ(
            "127.0.0.1:0", ["X"], optimizer_mode=False)
        with serv.do():
            out_var = main.global_block().create_var(
                name="scale_0.tmp_0",
                persistable=True,
                dtype="float32",
                shape=[32, 32])
            x = layers.data(
                shape=[32, 32],
                dtype='float32',
                name="X",
                append_batch_size=False)
            fluid.initializer.Constant(value=1.0)(x, main.global_block())
            layers.scale(x=x, scale=10.0, out=out_var)

    self.server_exe = fluid.Executor(place)
    self.server_exe.run(main)
def run_local(self, place): main = fluid.Program() with fluid.program_guard(main): x = layers.data(shape=[32, 32], dtype='float32', name='X', append_batch_size=False) fluid.initializer.Constant(value=2.3)(x, main.global_block()) o = layers.scale(x=x, scale=10.0) exe = fluid.Executor(place) self.local_out = exe.run(main, fetch_list=[o])
def forward(self, inputs, keys, values, lengths, start_index, speaker_embed=None, state=None, force_monotonic_attention=None, coeffs=None, window=(0, 4)): hidden = inputs for layer in self.prenet: hidden = layer(hidden, speaker_embed) attentions = [] # every layer of (B, T_dec, T_enc) attention final_state = [] # layers * (B, (k-1)d, C_dec) batch_size = inputs.shape[0] causal_padding_shape = (batch_size, self.kernel_size - 1, self.decoder_dim) for i in range(len(self.causal_convs)): if state is None: padding = F.zeros(causal_padding_shape, dtype="float32") else: padding = state[i] new_state = F.concat([padding, hidden], axis=1) # => to be used next step # causal conv, (B, T, C) hidden = self.causal_convs[i](hidden, speaker_embed, padding=padding) # attn prev_coeffs = None if coeffs is None else coeffs[i] force_monotonic = False if force_monotonic_attention is None else force_monotonic_attention[ i] context, attention = self.attention_blocks[i]( hidden, keys, values, lengths, speaker_embed, start_index, force_monotonic, prev_coeffs, window) # residual connextion (B, T_dec, C_dec) hidden = F.scale(hidden + context, np.sqrt(0.5)) attentions.append(attention) # layers * (B, T_dec, T_enc) # new state: shift a step, layers * (B, T, C) new_state = new_state[:, -(self.kernel_size - 1):, :] final_state.append(new_state) # predict mel spectrogram (B, 1, T_dec, r * C_in) decoded = self.out_affine(hidden) if self.has_bias: decoded *= F.sigmoid( F.unsqueeze(self.out_sp_affine(speaker_embed), [1])) return decoded, hidden, attentions, final_state
def run_local(self, place): main = fluid.Program() with fluid.program_guard(main): x = layers.data( shape=[32, 32], dtype='float32', name='X', append_batch_size=False) fluid.initializer.Constant(value=2.3)(x, main.global_block()) o = layers.scale(x=x, scale=10.0) exe = fluid.Executor(place) self.local_out = exe.run(main, fetch_list=[o])
def get_program(layer, input_spec, output_spec, **configs): paddle.jit.set_verbosity(0) prog_translator = program_translator.ProgramTranslator() if not prog_translator.enable_to_static: raise RuntimeError( "The paddle.jit.save doesn't work when setting ProgramTranslator.enable to False." ) if isinstance(layer, Layer): if isinstance(layer.forward, program_translator.StaticFunction): concrete_program = layer.forward.concrete_program else: # transform in jit.save, if input_spec is incomplete, declarative will throw error layer = paddle.jit.to_static(layer, input_spec=input_spec) concrete_program = layer.forward.concrete_program # the input_spec has been used in declarative, which is equal to # @declarative with input_spec and jit.save without input_spec, # avoid needless warning input_spec = None else: raise TypeError( "The input Layer should be 'Layer', but received type is %s." % type(layer)) feed_var_names = paddle.fluid.dygraph.jit._get_input_var_names( concrete_program.inputs, input_spec) target_vars = paddle.fluid.dygraph.jit._get_output_vars( concrete_program.outputs, output_spec) main_program = concrete_program.main_program.clone() with program_guard(main_program): uniq_target_vars = [] for i, var in enumerate(target_vars): if isinstance(var, Variable): var = layers.scale(var, 1., name="save_infer_model/scale_{}".format(i)) uniq_target_vars.append(var) target_vars = uniq_target_vars global_block = main_program.global_block() need_to_remove_op_index = [] for i, op in enumerate(global_block.ops): op.desc.set_is_target(False) if op.type == "feed" or op.type == "fetch": need_to_remove_op_index.append(i) for index in need_to_remove_op_index[::-1]: global_block._remove_op(index) main_program.desc.flush() main_program = main_program._prune_with_input( feeded_var_names=feed_var_names, targets=target_vars) main_program = main_program._inference_optimize(prune_read_op=True) fetch_var_names = [v.name for v in target_vars] prepend_feed_ops(main_program, feed_var_names) append_fetch_ops(main_program, fetch_var_names) return main_program, feed_var_names, target_vars
def model(): usr_combined_features = get_usr_combined_features() mov_combined_features = get_mov_combined_features() # need cos sim inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features) scale_infer = layers.scale(x=inference, scale=5.0) label = layers.data(name='score', shape=[1], dtype='float32') square_cost = layers.square_error_cost(input=scale_infer, label=label) avg_cost = layers.mean(square_cost) return scale_infer, avg_cost
def init_serv(self, place):
    main = fluid.Program()
    with fluid.program_guard(main):
        serv = layers.ListenAndServ(
            "127.0.0.1:0", ["X"], optimizer_mode=False)
        with serv.do():
            out_var = main.global_block().create_var(
                name="scale_0.tmp_0",
                persistable=True,
                dtype="float32",
                shape=[32, 32])
            x = layers.data(
                shape=[32, 32],
                dtype='float32',
                name="X",
                append_batch_size=False)
            fluid.initializer.Constant(value=1.0)(x, main.global_block())
            layers.scale(x=x, scale=10.0, out=out_var)

    self.server_exe = fluid.Executor(place)
    self.server_exe.run(main)
def create_rnn_op(self): x = layers.data( shape=[self.sent_len, self.batch_size, self.input_dim], dtype='float32', name='x', append_batch_size=False, **self.p_info) x.stop_gradient = False h_boot1 = layers.data( shape=[self.batch_size, self.input_dim], dtype='float32', name='h_boot1', append_batch_size=False, **self.p_info) h_boot1.stop_gradient = False h_boot2 = layers.data( shape=[self.batch_size, self.input_dim], dtype='float32', name='h_boot2', append_batch_size=False, **self.p_info) h_boot2.stop_gradient = False rnn = layers.StaticRNN(main_program=self.main_program) with rnn.step(): h_pre1 = rnn.memory(init=h_boot1) h_pre2 = rnn.memory(init=h_boot2) x_t = rnn.step_input(x) mem1 = layers.scale(x=h_pre1, scale=1.0, **self.p_info) mem2 = layers.scale(x=h_pre2, scale=1.0, **self.p_info) out = layers.sums(input=[mem1, x_t, mem2], **self.p_info) rnn.update_memory(h_pre1, mem1) rnn.update_memory(h_pre2, mem2) rnn.output(out) return rnn()
def test_read_write(self): x = [ layers.data( name='x0', shape=[100]), layers.data( name='x1', shape=[100]), layers.data( name='x2', shape=[100]) ] for each_x in x: each_x.stop_gradient = False i = layers.zeros(shape=[1], dtype='int64') i.stop_gradient = False arr = layers.array_write(x=x[0], i=i) i = layers.increment(x=i) arr = layers.array_write(x=x[1], i=i, array=arr) i = layers.increment(x=i) arr = layers.array_write(x=x[2], i=i, array=arr) i = layers.zeros(shape=[1], dtype='int64') i.stop_gradient = False a0 = layers.array_read(array=arr, i=i) i = layers.increment(x=i) a1 = layers.array_read(array=arr, i=i) i = layers.increment(x=i) a2 = layers.array_read(array=arr, i=i) mean_a0 = layers.mean(a0) mean_a1 = layers.mean(a1) mean_a2 = layers.mean(a2) a_sum = layers.sums(input=[mean_a0, mean_a1, mean_a2]) mean_x0 = layers.mean(x[0]) mean_x1 = layers.mean(x[1]) mean_x2 = layers.mean(x[2]) x_sum = layers.sums(input=[mean_x0, mean_x1, mean_x2]) scope = core.Scope() cpu = core.CPUPlace() exe = Executor(cpu) tensor = numpy.random.random(size=(100, 100)).astype('float32') outs = exe.run(feed={'x0': tensor, 'x1': tensor, 'x2': tensor}, fetch_list=[a_sum, x_sum], scope=scope) self.assertEqual(outs[0], outs[1]) total_sum = layers.sums(input=[a_sum, x_sum]) total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0) append_backward(total_sum_scaled) g_vars = map(default_main_program().global_block().var, [each_x.name + "@GRAD" for each_x in x]) g_out = [ item.sum() for item in exe.run( feed={'x0': tensor, 'x1': tensor, 'x2': tensor}, fetch_list=g_vars) ] g_out_sum = numpy.array(g_out).sum() # since our final gradient is 1 and the neural network are all linear # with mean_op. # the input gradient should also be 1 self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)