def decoder_step(gru_unit,
                 cue_gru_unit,
                 step_in,
                 hidden,
                 input_size,
                 hidden_size,
                 memory,
                 memory_mask,
                 knowledge,
                 mask=None):
    """Run one decoder step that fuses an input GRU with a knowledge GRU.

    The first slice of ``hidden`` along axis 0 is used as the query of
    ``dot_attention`` over ``memory``. The attended memory is concatenated
    with ``step_in`` (for ``gru_unit``) and with ``knowledge`` (for
    ``cue_gru_unit``); both resulting states go through a tanh projection
    and are blended with a learned sigmoid gate.

    Args:
        gru_unit: callable ``(input, hidden, mask)`` returning
            ``(output, last_hidden)``.
        cue_gru_unit: callable with the same contract, fed the knowledge input.
        step_in: input of the current step; an axis is inserted at position 1.
        hidden: previous hidden state.
        input_size: not used by this function.
        hidden_size: width handed to the ``fc`` projections.
        memory: memory attended over by ``dot_attention``.
        memory_mask: mask forwarded to ``dot_attention``.
        knowledge: knowledge input of the cue GRU.
        mask: optional step mask. ``None`` (the default) disables masking.

    Returns:
        tuple: ``(real_out, new_hidden)``. ``real_out`` is the attended memory
        concatenated (axis 2) with the transposed, unmasked fused state;
        ``new_hidden`` is the fused state, replaced by ``hidden`` wherever
        ``mask`` is zero when a mask is supplied.
    """
    # Use the first slice of `hidden` along axis 0 as the attention query.
    top_hidden = layers.slice(hidden, axes=[0], starts=[0], ends=[1])
    top_hidden = layers.squeeze(top_hidden, axes=[0])
    top_hidden = layers.unsqueeze(top_hidden, axes=[1])
    weight_memory, _ = dot_attention(top_hidden, memory, memory_mask)

    step_in = layers.unsqueeze(step_in, axes=[1])
    # With an unknown (-1) leading dimension, give `knowledge` the shape of
    # the attended memory before the two are concatenated.
    if weight_memory.shape[0] == -1:
        knowledge_1 = layers.reshape(knowledge, shape=weight_memory.shape)
    else:
        knowledge_1 = knowledge

    rnn_input = layers.concat([step_in, weight_memory], axis=2)
    rnn_input = layers.squeeze(rnn_input, axes=[1])
    _, rnn_last_hidden = gru_unit(rnn_input, hidden, mask)

    cue_input = layers.concat([knowledge_1, weight_memory], axis=2)
    cue_input = layers.squeeze(cue_input, axes=[1])
    _, cue_rnn_last_hidden = cue_gru_unit(cue_input, hidden, mask)

    h_y = layers.tanh(
        fc(rnn_last_hidden, hidden_size, hidden_size, name="dec_fc1"))
    h_cue = layers.tanh(
        fc(cue_rnn_last_hidden, hidden_size, hidden_size, name="dec_fc2"))

    concate_y_cue = layers.concat([h_y, h_cue], axis=2)
    k = layers.sigmoid(fc(concate_y_cue, hidden_size * 2, 1, name='dec_fc3'))
    # Gate: k * h_y + (1 - k) * h_cue, written as in the original.
    new_hidden = h_y * k - h_cue * (k - 1.0)

    new_hidden_tmp = layers.transpose(new_hidden, perm=[1, 0, 2])
    real_out = layers.concat([weight_memory, new_hidden_tmp], axis=2)

    # Compare against None explicitly: truth-testing a tensor is not a
    # reliable way to ask "was a mask supplied?".
    if mask is not None:
        mask_tmp = layers.unsqueeze(mask, axes=[0])
        new_hidden = layers.elementwise_mul((new_hidden - hidden),
                                            mask_tmp,
                                            axis=0)
        new_hidden += hidden
    return real_out, new_hidden
def forward(self, audio, mel, audio_start, clip_kl=True):
    """Compute the training losses of the Clarinet model.

    Args:
        audio (Variable): shape(B, T_audio), dtype float32, ground truth
            waveform.
        mel (Variable): shape(B, F, T_mel), dtype float32, condition
            (mel spectrogram here).
        audio_start (Variable): shape(B, ), dtype int64, audio start
            positions.
        clip_kl (bool, optional): whether to clip the KL term into
            [-100, 10] before averaging. Defaults to True.

    Returns:
        Dict(str, Variable)
        loss: total loss, ``kl + self.lmd * regularization + stft_loss``.
        kl_divergence: mean KL divergence between the student's and the
            teacher's output distributions (context steps dropped).
        regularization: mean squared error between the teacher's and the
            student's unclipped scales (context steps dropped).
        stft_loss: mean squared error between the STFT magnitudes of the
            ground truth waveform and of the synthesized waveform.
    """
    _, audio_length = audio.shape  # audio clip's length

    noise = F.gaussian_random(audio.shape)
    condition = self.encoder(mel)  # (B, C, T)
    condition_slice = crop(condition, audio_start, audio_length)

    # student outputs cover all time steps [0: T]
    x, s_means, s_scales = self.student(noise, condition_slice)
    s_means = s_means[:, 1:]  # (B, T-1), time steps [1: T]
    s_scales = s_scales[:, 1:]  # (B, T-1), time steps [1: T]
    s_clipped_scales = F.clip(s_scales, self.min_log_scale, 100.)

    # teacher outputs a single gaussian for time steps [1: T]
    teacher_out = self.teacher(x[:, :-1], condition_slice[:, :, 1:])
    _, t_means, t_scales = F.split(teacher_out, 3, -1)
    t_means = F.squeeze(t_means, [-1])  # (B, T-1), time steps [1: T]
    t_scales = F.squeeze(t_scales, [-1])  # (B, T-1), time steps [1: T]
    t_clipped_scales = F.clip(t_scales, self.min_log_scale, 100.)

    s_distribution = D.Normal(s_means, F.exp(s_clipped_scales))
    t_distribution = D.Normal(t_means, F.exp(t_clipped_scales))

    # closed-form KL between the two gaussians, no Monte Carlo sampling
    kl = s_distribution.kl_divergence(t_distribution)
    if clip_kl:
        kl = F.clip(kl, -100., 10.)

    # the teacher's context steps are dropped from both terms below
    context = self.teacher.context_size
    kl = F.reduce_mean(kl[:, context:])
    regularization = F.mse_loss(t_scales[:, context:],
                                s_scales[:, context:])

    # introduce information from the real target
    spectrogram_frame_loss = F.mse_loss(
        self.stft.magnitude(audio), self.stft.magnitude(x))

    loss = kl + self.lmd * regularization + spectrogram_frame_loss
    return {
        "loss": loss,
        "kl_divergence": kl,
        "regularization": regularization,
        "stft_loss": spectrogram_frame_loss
    }
def forward(self,
            q,
            k,
            v,
            lengths,
            speaker_embed,
            start_index,
            force_monotonic=False,
            prev_coeffs=None,
            window=None):
    """Compute attention contexts and coefficients.

    Position encodings are added to the query and key before the affine
    projections. During training the encoder padding is masked; at inference
    an optional monotonic window around the previously attended position
    is applied instead.

    Args:
        q: query input.
        k: key input.
        v: value input.
        lengths: valid encoder lengths, used for the padding mask and for
            the context normalization.
        speaker_embed: speaker embedding; only read when ``self.has_bias``.
        start_index: start position handed to the query position encoding.
        force_monotonic (bool): apply the monotonic window when not training.
        prev_coeffs: coefficients of the previous step, or None.
        window: pair ``(backward_step, forward_step)``; required when
            ``force_monotonic`` is used at inference.

    Returns:
        tuple: ``(contexts, coefficients)``.
    """
    # add position encoding as an inductive bias
    if self.has_bias:
        # multi-speaker model: position rates come from the speaker embedding
        omega_q = 2 * F.sigmoid(
            F.squeeze(self.q_pos_affine(speaker_embed), axes=[-1]))
        omega_k = 2 * self.omega_initial * F.sigmoid(
            F.squeeze(self.k_pos_affine(speaker_embed), axes=[-1]))
    else:
        # single-speaker case: constant position rates
        n_items = q.shape[0]
        omega_q = F.ones((n_items, ), dtype="float32")
        omega_k = F.ones((n_items, ), dtype="float32") * self.omega_default

    q += self.position_encoding_weight * positional_encoding(
        q, start_index, omega_q)
    k += self.position_encoding_weight * positional_encoding(k, 0, omega_k)

    q, k, v = self.q_affine(q), self.k_affine(k), self.v_affine(v)
    activations = F.matmul(q, k, transpose_y=True)
    activations /= np.sqrt(self.attention_dim)

    if self.training:
        # mask the <pad> parts from the encoder
        pad_mask = F.sequence_mask(lengths, dtype="float32")
        pad_bias = F.scale(1. - pad_mask, -1000)
        activations += F.unsqueeze(pad_bias, [1])
    elif force_monotonic:
        assert window is not None
        backward_step, forward_step = window
        T_enc = k.shape[1]
        n_items, T_dec, _ = q.shape  # actually T_dec = 1 here
        if prev_coeffs is None:
            alpha = F.fill_constant(
                (n_items, T_dec), value=0, dtype="int64")
        else:
            alpha = F.argmax(prev_coeffs, axis=-1)
        behind = F.sequence_mask(
            alpha - backward_step, maxlen=T_enc, dtype="bool")
        ahead = F.sequence_mask(
            alpha + forward_step, maxlen=T_enc, dtype="bool")
        # positions covered by exactly one of the two masks form the window
        window_mask = F.cast(F.logical_xor(behind, ahead), "float32")
        window_bias = F.scale(1. - window_mask, -1000)
        activations += window_bias

    # softmax, then dropout on the attention coefficients
    coefficients = F.softmax(activations, axis=-1)
    coefficients = F.dropout(
        coefficients,
        1. - self.keep_prob,
        dropout_implementation='upscale_in_train')

    # context vector, scaled by sqrt of the encoder length
    contexts = F.matmul(coefficients, v)
    enc_lengths = F.cast(F.unsqueeze(lengths, axes=[1, 2]), "float32")
    contexts *= F.sqrt(enc_lengths)

    # out affine
    contexts = self.out_affine(contexts)
    return contexts, coefficients
def forward(self, img, label, mask=None, return_loss=True):
    """Classify ``img`` and return either the losses or the scores.

    Args:
        img: input handed to ``self.backbone``.
        label: ground truth labels, forwarded to ``self.get_losses``.
        mask: optional mask, forwarded to ``self.get_losses``.
        return_loss (bool): when True, apply dropout and return the result
            of ``self.get_losses``; otherwise return column 0 of the softmax
            output as a numpy array.

    Returns:
        The losses from ``self.get_losses`` or a numpy array of scores.
    """
    feats = self.backbone(img)
    pooled = self.avgpool(feats[-1])

    if not return_loss:
        # inference path: no dropout, softmax score of class 0
        logits = self.fc(L.squeeze(pooled, axes=[2, 3]))
        return L.softmax(logits).numpy()[:, 0]

    # training path: dropout is always active (is_test=False)
    pooled = L.dropout(pooled, dropout_prob=self.dropout, is_test=False)
    logits = self.fc(L.squeeze(pooled, axes=[2, 3]))
    return self.get_losses(feats, logits, mask, label)
def forward(self, encoder_output):
    """Predict the duration of each character.

    Two conv -> layer norm -> relu -> dropout stages are followed by a
    linear layer with relu, whose last axis is then squeezed away.

    Args:
        encoder_output (Variable): shape(B, T, C), dtype float32, the
            encoder output.

    Returns:
        out (Variable): the output of the duration predictor with the last
            axis removed (presumably shape(B, T) -- TODO confirm against
            the output width of ``self.linear``).
    """
    stages = ((self.conv1, self.layer_norm1),
              (self.conv2, self.layer_norm2))

    hidden = encoder_output
    for conv, norm in stages:
        # the convolutions run with channels on axis 1, the layer norms
        # with channels on the last axis, hence the transposes around conv
        hidden = layers.transpose(hidden, [0, 2, 1])
        hidden = conv(hidden)
        hidden = layers.transpose(hidden, [0, 2, 1])
        hidden = layers.dropout(
            layers.relu(norm(hidden)),
            self.dropout,
            dropout_implementation='upscale_in_train')

    hidden = layers.relu(self.linear(hidden))
    return layers.squeeze(hidden, axes=[-1])
def test_squeeze(self):
    """Build a program holding one squeeze op and print the program."""
    prog = Program()
    with program_guard(prog):
        data = layers.data(name='x', shape=[1, 1, 4], dtype='float32')
        squeezed = layers.squeeze(input=data, axes=[2])
        self.assertIsNotNone(squeezed)
    print(str(prog))
def epoch_predict(env, args, model, loader):
    """Predict arcs, relations and (optionally) arc probabilities for one epoch.

    Args:
        env: environment object; ``env.REL.vocab`` maps relation ids back
            to relation labels.
        args: options; ``pad_index`` and ``prob`` are read here, and the
            object is forwarded to ``decode``.
        model: the model; switched to eval mode and called as
            ``model(words, feats)``.
        loader: callable returning an iterable of ``(words, feats)`` batches.

    Returns:
        tuple: ``(arcs, rels, probs)``, one entry per sentence. ``probs`` is
        empty unless ``args.prob`` is set; probabilities are rounded to
        three decimals.
    """
    model.eval()

    arcs, rels, probs = [], [], []
    for words, feats in loader():
        # ignore the first token of each sentence
        tmp_words = layers.pad(words[:, 1:],
                               paddings=[0, 0, 1, 0],
                               pad_value=args.pad_index)
        mask = tmp_words != args.pad_index
        lens = nn.reduce_sum(mask, -1)
        s_arc, s_rel = model(words, feats)
        arc_preds, rel_preds = decode(args, s_arc, s_rel, mask)

        # Convert the lengths to a Python list once per batch instead of
        # once per split call.
        sections = lens.numpy().tolist()

        arcs.extend(
            layers.split(nn.masked_select(arc_preds, mask), sections))
        rels.extend(
            layers.split(nn.masked_select(rel_preds, mask), sections))
        if args.prob:
            # probability of the predicted head of every token
            arc_probs = nn.index_sample(layers.softmax(s_arc, -1),
                                        layers.unsqueeze(arc_preds, -1))
            probs.extend(
                layers.split(
                    nn.masked_select(layers.squeeze(arc_probs, axes=[-1]),
                                     mask), sections))

    arcs = [seq.numpy().tolist() for seq in arcs]
    rels = [env.REL.vocab[seq.numpy().tolist()] for seq in rels]
    probs = [[round(p, 3) for p in seq.numpy().tolist()] for seq in probs]
    return arcs, rels, probs
def forward(self, seq):
    """Apply ``self.conv2d`` to a sequence via a temporary trailing axis.

    The last two axes of ``seq`` are swapped, a trailing axis is appended
    for the 2-D convolution, and both steps are undone afterwards.

    Args:
        seq: input sequence (assumed 3-D -- TODO confirm with callers).

    Returns:
        The convolved sequence with the trailing axis removed and the last
        two axes swapped back.
    """
    swapped = layers.transpose(seq, [0, 2, 1])
    with_extra_axis = layers.unsqueeze(swapped, -1)
    convolved = self.conv2d(with_extra_axis)
    without_extra_axis = layers.squeeze(convolved, [-1])
    return layers.transpose(without_extra_axis, [0, 2, 1])
def create_loss_op(self, predict, label, epsilon=1e-7):
    """Compute the mean loss of a batch.

    Args:
        predict: model output tensor activated by softmax.
        label: a non-sparse tensor.
        epsilon: lower clipping bound applied to ``1 - predict`` before the
            logarithm in the "nl" loss.

    Returns:
        loss: the mean of the "nl" loss (``-log(1 - p)`` on the labelled
            class) when ``self.loss_type == "nl"`` and
            ``self.model_type == "train"``; otherwise the mean
            cross-entropy loss.
    """
    use_nl_loss = self.loss_type == "nl" and self.model_type == "train"

    if not use_nl_loss:
        # PL or evaluation
        cost = FL.cross_entropy(predict, label)
    else:
        one_hot_label = fluid.one_hot(label, depth=predict.shape[-1])
        one_hot_label = FL.squeeze(one_hot_label, axes=[-2])

        # log(1 - p), clipped away from zero
        neg_prob = 1 - predict
        log_neg_prob = FL.log(
            fluid.layers.clip(neg_prob, min=epsilon, max=1.))
        ce_loss = -1 * log_neg_prob * one_hot_label
        cost = FL.reduce_sum(ce_loss, dim=-1, keep_dim=True)

    return FL.mean(cost)
def forward(self, *args, **kwargs):
    """Run the encoder and the token classifier, optionally with a loss.

    Args:
        labels (optional, `Variable` of shape [batch_size, seq_len]):
            ground truth label id for each token.
        loss_weights (optional, `Variable` of shape [batch_size, seq_len]):
            weights of the loss for each token.
        ignore_index (optional, int, default -100): tokens whose label
            equals `ignore_index` do not contribute to the loss.
        All remaining positional and keyword arguments are forwarded to the
        parent class's `forward`.

    Returns:
        loss (`Variable`): cross entropy loss averaged with `reduce_mean`;
            None if `labels` is not set.
        logits (`Variable`): output logits of the classifier.
    """
    ignore_index = kwargs.pop('ignore_index', -100)
    labels = kwargs.pop('labels', None)
    loss_weights = kwargs.pop('loss_weights', None)

    _, encoded = super(ErnieModelForTokenClassification,
                       self).forward(*args, **kwargs)
    logits = self.classifier(self.dropout(encoded))

    if labels is None:
        return None, logits

    # softmax_with_cross_entropy is fed labels with a trailing axis
    if len(labels.shape) == 2:
        labels = L.unsqueeze(labels, axes=[-1])
    loss = L.softmax_with_cross_entropy(logits,
                                        labels,
                                        ignore_index=ignore_index)
    if loss_weights is not None:
        loss = L.squeeze(loss, [-1]) * loss_weights
    loss = L.reduce_mean(loss)
    return loss, logits
def get_metrics(self, inputs, outputs):
    """Get the classification metrics.

    Args:
        inputs: dict; ``inputs["label"]`` holds the ground truth labels.
        outputs: dict; ``outputs["enc_out"]`` holds the encoder output.

    Returns:
        dict with "loss", "cls_loss" and "cls_acc"; for binary
        classification also the confusion counts "stat_tp", "stat_fp",
        "stat_tn" and "stat_fn".
    """
    pooled_out = self._get_pooled_output(outputs["enc_out"])
    cls_logits = self._get_classifier_output(pooled_out,
                                             num_classes=self.num_classes,
                                             name="cls")
    cls_loss, cls_softmax = layers.softmax_with_cross_entropy(
        logits=cls_logits, label=inputs["label"], return_softmax=True)
    cls_acc = layers.accuracy(cls_softmax, inputs["label"])
    mean_cls_loss = layers.mean(cls_loss)

    metrics = {
        "loss": mean_cls_loss,
        "cls_loss": mean_cls_loss,
        "cls_acc": cls_acc,
    }

    # statistics for recall & precision & f1
    if self.num_classes == 2:
        pred = layers.argmax(cls_softmax, axis=1)
        label = layers.squeeze(inputs["label"], axes=[1])
        # (metric name, predicted class, true class)
        confusion_cells = (
            ("stat_tp", 1, 1),
            ("stat_fp", 1, 0),
            ("stat_tn", 0, 0),
            ("stat_fn", 0, 1),
        )
        for key, pred_class, true_class in confusion_cells:
            hits = layers.logical_and(pred == pred_class,
                                      label == true_class)
            metrics[key] = layers.reduce_sum(hits.astype("float32"))
    return metrics
def attention(self, hidden, encoder_output, encoder_output_proj,
              encoder_padding_mask):
    """Compute the attention context vector c_i (Bahdanau attention).

    Args:
        hidden: current decoder state.
        encoder_output: encoder outputs that are averaged into the context.
        encoder_output_proj: projected encoder outputs used for scoring.
        encoder_padding_mask: additive mask for the scores, or None.

    Returns:
        context: the attention-weighted sum of ``encoder_output`` over
            axis 1.
    """
    # Project the decoder state and insert an axis at position 1.
    projected = layers.fc(hidden, size=self.hidden_size, bias_attr=False)
    decoder_state_proj = layers.unsqueeze(projected, [1])

    # Combine the single decoder vector with every encoder output.
    # NOTE(review): the expand count is read from decoder_state_proj's own
    # axis 1 (just inserted above), not from the encoder output -- confirm
    # this is intended.
    expand_times = [1, layers.shape(decoder_state_proj)[1], 1]
    mixed_state = fluid.layers.elementwise_add(
        encoder_output_proj, layers.expand(decoder_state_proj, expand_times))

    # Reduce every combined vector to a single score.
    raw_scores = layers.fc(input=mixed_state,
                           size=1,
                           num_flatten_dims=2,
                           bias_attr=False)
    attn_scores = layers.squeeze(raw_scores, [2])
    if encoder_padding_mask is not None:
        attn_scores = layers.elementwise_add(attn_scores,
                                             encoder_padding_mask)

    # Softmax turns the scores into attention weights.
    attn_scores = layers.softmax(attn_scores)

    # The context is the weighted sum of the encoder outputs.
    weighted = layers.elementwise_mul(encoder_output, attn_scores, axis=0)
    return layers.reduce_sum(weighted, dim=1)
def _get_pooled_output(self, enc_out, idx=None, name="pooled"):
    """Get pooled output of the last output embedding in Transformer.

    Args:
        enc_out: the output embeddings of Transformer, shape is
            [batch_size, max_seq_len, hidden_size]
        idx (optional): the selected indices in pooling operator, shape is
            [batch_size, 1] or [batch_size, 2]. When omitted, position 0 of
            every sequence is pooled.
        name: a string, the name of the pooling layer.

    Returns:
        pooled_out: the pooled output embedding, shape is
            [batch_size, hidden_size].

    Raises:
        ValueError: if ``idx`` is not 2-D with a second dimension of 1 or 2.
    """
    if idx is None:
        feat = enc_out[:, 0]
    else:
        # second dimension of a 2-D index tensor, None for any other rank
        index_width = idx.shape[1] if len(idx.shape) == 2 else None
        if index_width == 1:
            enc_out = layers.squeeze(enc_out, [1])
            feat = layers.gather(input=enc_out, index=idx)
        elif index_width == 2:
            feat = layers.gather_nd(input=enc_out, index=idx)
        else:
            raise ValueError(f"Invalid indices shape {idx.shape} is used")

    weight_attr = fluid.ParamAttr(name=f"{name}_fc.w_0",
                                  initializer=self.param_initializer)
    return layers.fc(input=feat,
                     size=self.hidden_size,
                     act="tanh",
                     param_attr=weight_attr,
                     bias_attr=f"{name}_fc.b_0")
def decoder_step(currrent_in, pre_feed, pre_hidden_array, pre_cell_array,
                 enc_memory):
    """Run one step of the stacked decoder followed by dot attention.

    Reads ``self``, ``dec_unit_list``, ``src_mask``, ``memory_weight`` and
    ``attention_weight`` from the enclosing scope.

    Args:
        currrent_in: input of the current step.
        pre_feed: feed from the previous step, concatenated to the input.
        pre_hidden_array: previous hidden state of every layer.
        pre_cell_array: previous cell state of every layer.
        enc_memory: encoder memory attended over.

    Returns:
        tuple: ``(concat_att_out, new_hidden_array, new_cell_array)``.
    """
    hidden_states, cell_states = [], []

    layer_input = layers.concat([currrent_in, pre_feed], 1)
    for layer_idx in range(self.num_layers):
        prev_hidden = pre_hidden_array[layer_idx]
        prev_cell = pre_cell_array[layer_idx]
        hidden_i, cell_i = dec_unit_list[layer_idx](layer_input, prev_hidden,
                                                    prev_cell)
        hidden_states.append(hidden_i)
        cell_states.append(cell_i)
        # each layer feeds the next one
        layer_input = hidden_i

    # NOTE(review): memory_mask is computed but never passed to
    # dot_attention below -- confirm whether masking was intended.
    memory_mask = src_mask - 1.0
    enc_memory = layers.matmul(enc_memory, memory_weight)

    query = layers.unsqueeze(layer_input, [1])
    attended, _ = dot_attention(query, enc_memory)
    attended = layers.squeeze(attended, [1])

    merged = layers.concat([attended, layer_input], 1)
    merged = layers.matmul(merged, attention_weight)
    return merged, hidden_states, cell_states
def forward(self, cue, label, return_loss=True):
    """Classify ``cue`` and return either the loss dict or the scores.

    Args:
        cue: input handed to the first convolution.
        label: ground truth labels for the cross-entropy loss.
        return_loss (bool): when True, return a dict with "loss_cls" and
            "loss" (the same value); otherwise return column 0 of the
            softmax output as a numpy array.

    Returns:
        dict of losses, or a numpy array of scores.
    """
    feat = cue
    for layer in (self.conv1, self.norm1, self.maxpool, self.conv2,
                  self.norm2, self.avgpool):
        feat = layer(feat)

    if not return_loss:
        logits = self.fc(L.squeeze(feat, axes=[2, 3]))
        return L.softmax(logits).numpy()[:, 0]

    # dropout is always active on the training path (is_test=False)
    dropped = L.dropout(feat, dropout_prob=0.5, is_test=False)
    logits = self.fc(L.squeeze(dropped, axes=[2, 3]))
    loss_cls = L.mean(L.cross_entropy(logits, label))
    return dict(loss_cls=loss_cls, loss=loss_cls)
def encoder_1(x_emb,
              vocab_size,
              emb_size,
              init_hidden=None,
              init_cell=None,
              para_name='',
              args=None):
    """Run a two-layer ``lstmp_encoder`` stack over ``x_emb``.

    ``hidden_size`` and ``test_mode`` are read from the enclosing scope.

    Args:
        x_emb: embedded input.
        vocab_size: not used by this function.
        emb_size: forwarded to ``lstmp_encoder``.
        init_hidden: optional initial hidden states, sliced per layer along
            axis 0.
        init_cell: optional initial cell states, sliced per layer along
            axis 0.
        para_name: prefix of the per-layer parameter names.
        args: forwarded to ``lstmp_encoder``.

    Returns:
        tuple: the output of the last layer (with the residual added and
        gradients stopped) and the list of raw per-layer outputs.
    """

    def layer_state(states, layer_idx):
        # state of one layer: slice along axis 0, then drop that axis
        sliced = layers.slice(states,
                              axes=[0],
                              starts=[layer_idx],
                              ends=[layer_idx + 1])
        return layers.squeeze(sliced, axes=[0])

    # NOTE(review): rnn_input is never reassigned, so every layer is fed
    # x_emb -- confirm this is intended.
    rnn_input = x_emb
    rnn_outs = []
    rnn_outs_ori = []
    num_layers = 2
    for layer_idx in range(num_layers):
        if init_hidden and init_cell:
            h0 = layer_state(init_hidden, layer_idx)
            c0 = layer_state(init_cell, layer_idx)
        else:
            h0 = c0 = None

        layer_name = para_name + 'layer{}'.format(layer_idx + 1)
        rnn_out, _cell, _input_proj = lstmp_encoder(rnn_input, hidden_size,
                                                    h0, c0, layer_name,
                                                    emb_size, test_mode, args)
        rnn_outs_ori.append(rnn_out)

        # residual connection on every layer after the first
        if layer_idx > 0:
            rnn_out = rnn_out + rnn_input
        rnn_out.stop_gradient = True
        rnn_outs.append(rnn_out)

    return rnn_outs[-1], rnn_outs_ori
def get_losses(self, out, cls_out, mask, gt_labels):
    """Combine the classification loss with per-stage triplet losses.

    Args:
        out: sequence of feature maps; all but the last one contribute a
            triplet loss.
        cls_out: classifier output for the cross-entropy loss.
        mask: not used by this method.
        gt_labels: ground truth labels.

    Returns:
        dict with "loss_cls", "loss_tir" and their sum "loss"; the weights
        come from ``self.train_cfg['w_cls']`` and ``self.train_cfg['w_tri']``.
    """
    cfg = self.train_cfg
    loss_cls = L.mean(L.cross_entropy(cls_out, gt_labels)) * cfg['w_cls']

    loss_tir = 0
    for stage_feat in out[:-1]:
        pooled = L.squeeze(self.avgpool(stage_feat), axes=[2, 3])
        loss_tir += self.triple_loss(pooled, gt_labels) * cfg['w_tri']

    return {
        'loss_cls': loss_cls,
        'loss_tir': loss_tir,
        'loss': loss_cls + loss_tir,
    }
def rnn_decoder(gru_unit,
                cue_gru_unit,
                input,
                input_size,
                hidden_size,
                num_layers,
                memory,
                memory_mask,
                knowledge,
                output_size,
                init_hidden=None,
                mask=None,
                dropout=0.0,
                batch_first=True,
                name="decoder"):
    """Unroll ``decoder_step`` over a sequence and project to a softmax.

    Args:
        gru_unit: GRU unit forwarded to ``decoder_step``.
        cue_gru_unit: knowledge GRU unit forwarded to ``decoder_step``.
        input: token ids, embedded with ``get_embedding``.
        input_size: forwarded to ``get_embedding`` and ``decoder_step``.
        hidden_size: hidden width of the decoder and of the first output fc.
        num_layers: not used by this function.
        memory: attention memory forwarded to ``decoder_step``.
        memory_mask: attention mask forwarded to ``decoder_step``.
        knowledge: knowledge input forwarded to ``decoder_step``.
        output_size: width of the final projection.
        init_hidden: initial value of the recurrent memory.
        mask: optional mask multiplied into the decoder output after the
            final transpose. ``None`` (the default) disables masking.
        dropout: dropout probability applied before the output projections.
        batch_first: when True the embedded input is transposed with
            perm [1, 0, 2] before stepping.
        name: not used by this function.

    Returns:
        The softmax over the projected decoder outputs.
    """
    # Compare against None explicitly instead of truth-testing a tensor.
    has_mask = mask is not None

    input_emb = get_embedding(input, input_size, output_size)
    if batch_first:
        input_emb = layers.transpose(input_emb, perm=[1, 0, 2])

    # The mask is multiplied with the output only after the unconditional
    # [1, 0, 2] transpose below, so it is transposed for stepping whenever
    # it is given. The original defined `trans_mask` only under
    # `batch_first`, which raised NameError for batch_first=False.
    trans_mask = None
    if has_mask:
        trans_mask = layers.transpose(mask, perm=[1, 0])

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(input_emb)
        step_mask = rnn.step_input(trans_mask) if has_mask else None

        pre_hidden = rnn.memory(init=init_hidden)
        real_out, last_hidden = decoder_step(gru_unit,
                                             cue_gru_unit,
                                             step_in,
                                             pre_hidden,
                                             input_size,
                                             hidden_size,
                                             memory,
                                             memory_mask,
                                             knowledge,
                                             mask=step_mask)
        rnn.update_memory(pre_hidden, last_hidden)

        step_in = layers.squeeze(real_out, axes=[1])
        rnn.step_output(step_in)

    rnnout = rnn()
    rnnout = layers.transpose(rnnout, perm=[1, 0, 2])
    # Masking is skipped without a mask; the original multiplied by None
    # here and therefore failed for the default mask=None.
    if has_mask:
        rnnout = layers.elementwise_mul(rnnout, mask, axis=0)

    # each step output concatenates two hidden_size-wide parts
    output_in_size = hidden_size + hidden_size
    rnnout = layers.dropout(rnnout, dropout_prob=dropout)
    rnnout = fc(rnnout, output_in_size, hidden_size, name='dec_out_fc1')
    rnnout = fc(rnnout, hidden_size, output_size, name='dec_out_fc2')

    softmax_out = layers.softmax(rnnout)
    return softmax_out
def _calc_bow_logits(self, enc_out, bow_idx):
    """Get the logits of BoW task.

    The network may share weight with token embeddings.

    Args:
        enc_out: the output embeddings of Transformer, shape is
            [batch_size, max_seq_len, hidden_dim]
        bow_idx: the indices of prediction tokens, shape is
            [num_predictions, 1] or [num_predictions, 2].

    Returns:
        logits: the logits of prediction task, shape is
            [num_predictions, vocab_size].

    Raises:
        ValueError: if ``bow_idx`` is not 2-D with a second dimension of
            1 or 2.
    """
    # second dimension of a 2-D index tensor, None for any other rank
    index_width = bow_idx.shape[1] if len(bow_idx.shape) == 2 else None
    if index_width == 1:
        enc_out = layers.squeeze(enc_out, [1])
        bow_feat = layers.gather(input=enc_out,
                                 index=bow_idx,
                                 overwrite=False)
    elif index_width == 2:
        bow_feat = layers.gather_nd(input=enc_out, index=bow_idx)
    else:
        raise ValueError(f"Invalid indices shape {bow_idx.shape} is used")

    trans_weight_attr = fluid.ParamAttr(name="bow_trans_fc.w_0",
                                        initializer=self.param_initializer)
    bow_trans_feat = layers.fc(input=bow_feat,
                               size=self.emb_size,
                               act=self.hidden_act,
                               param_attr=trans_weight_attr,
                               bias_attr="bow_trans_fc.b_0")
    bow_trans_feat = pre_process_layer(bow_trans_feat,
                                       self.post_cls_cmd,
                                       name="bow_trans")

    if not self.weight_sharing:
        # dedicated output projection
        out_weight_attr = fluid.ParamAttr(name="bow_out_fc.w_0",
                                          initializer=self.param_initializer)
        out_bias_attr = "bow_out_fc.b_0" if self.cls_bias else False
        return layers.fc(input=bow_trans_feat,
                         size=self.vocab_size,
                         param_attr=out_weight_attr,
                         bias_attr=out_bias_attr)

    # reuse the token embedding matrix as the output projection
    token_emb = fluid.default_main_program().global_block().var(
        self.token_emb_name)
    bow_logits = layers.matmul(x=bow_trans_feat, y=token_emb, transpose_y=True)
    if self.cls_bias:
        bow_logits += layers.create_parameter(
            shape=[self.vocab_size],
            dtype=self.dtype,
            attr=fluid.ParamAttr(name="bow_out_fc.b_0"),
            is_bias=True)
    return bow_logits
def encoder_wrapper(x_emb,
                    vocab_size,
                    emb_size,
                    init_hidden=None,
                    init_cell=None,
                    para_name='',
                    args=None):
    """Run a two-layer ``lstmp_encoder`` stack and return all layer outputs.

    ``hidden_size`` is read from the enclosing scope.

    Args:
        x_emb: embedded input.
        vocab_size: not used by this function.
        emb_size: forwarded to ``lstmp_encoder``.
        init_hidden: optional initial hidden states, sliced per layer along
            axis 0.
        init_cell: optional initial cell states, sliced per layer along
            axis 0.
        para_name: prefix of the per-layer parameter names.
        args: forwarded to ``lstmp_encoder``.

    Returns:
        tuple: the list of per-layer outputs (with the residual added and
        gradients stopped) and the list of raw per-layer outputs.
    """

    def layer_state(states, layer_idx):
        # state of one layer: slice along axis 0, then drop that axis
        sliced = layers.slice(states,
                              axes=[0],
                              starts=[layer_idx],
                              ends=[layer_idx + 1])
        return layers.squeeze(sliced, axes=[0])

    # NOTE(review): rnn_input is never reassigned, so every layer is fed
    # x_emb -- confirm this is intended.
    rnn_input = x_emb
    rnn_outs = []
    rnn_outs_ori = []
    num_layers = 2
    for layer_idx in range(num_layers):
        if init_hidden and init_cell:
            h0 = layer_state(init_hidden, layer_idx)
            c0 = layer_state(init_cell, layer_idx)
        else:
            h0 = c0 = None

        layer_name = para_name + 'layer{}'.format(layer_idx + 1)
        rnn_out, _cell, _input_proj = lstmp_encoder(rnn_input, hidden_size,
                                                    h0, c0, layer_name,
                                                    emb_size, args)
        rnn_outs_ori.append(rnn_out)

        # residual connection on every layer after the first
        if layer_idx > 0:
            rnn_out = rnn_out + rnn_input
        rnn_out.stop_gradient = True
        rnn_outs.append(rnn_out)

    return rnn_outs, rnn_outs_ori
def forward(self, x):
    """Compute Conv1DTranspose by unsqueezing the input and squeezing the output.

    Args:
        x (Variable): shape(B, C_in, T_in), dtype float32, input of
            Conv1DTranspose.

    Returns:
        Variable: shape(B, C_out, T_out), dtype float32, output of
            Conv1DTranspose.
    """
    expanded = F.unsqueeze(x, [2])
    transposed_conv = super(Conv1DTranspose, self).forward(expanded)
    # NOTE(review): the original author flagged this squeeze of axis 2 as
    # "maybe risky".
    return F.squeeze(transposed_conv, [2])
def expand(self, batch, predicted, alpha):
    """Repeat every time step of ``batch`` by its predicted count.

    Only row 0 of ``predicted`` is read and axis 0 of ``batch`` is squeezed
    away, so a single sequence is processed per call.

    Args:
        batch: input whose axis 1 indexes time steps.
        predicted: tensor of repeat counts, read through ``.numpy()``.
        alpha: not used by this method.

    Returns:
        The repeated time steps concatenated along axis 0; steps with a
        count of zero are left out.
    """
    time_steps = batch.shape[1]
    fertilities = predicted.numpy()
    batch = layers.squeeze(batch, [0])

    repeated = [
        layers.expand(batch[step:step + 1, :],
                      [int(fertilities[0, step]), 1])
        for step in range(time_steps)
        if fertilities[0, step] != 0
    ]
    return layers.concat(repeated, axis=0)
def siamLSTM(tok_ids1, tok_ids2, len1, len2, conf):
    """Score a pair of token sequences with a weight-sharing LSTM encoder.

    Both sequences use the same embedding table ('embedding') and the same
    LSTM parameters ('lstm_w', 'lstm_b'). The element-wise product of the
    two encodings is projected to two outputs.

    Args:
        tok_ids1: token ids of the first sequence.
        tok_ids2: token ids of the second sequence.
        len1: lengths of the first sequences.
        len2: lengths of the second sequences.
        conf: dict providing 'vocab_size' and 'hidden_size'.

    Returns:
        sim: output of the final fc layer with size 2.
    """
    vocab_size = conf['vocab_size']
    hidden_size = conf['hidden_size']

    emb_attr = fluid.ParamAttr(
        'embedding',
        initializer=fluid.initializer.UniformInitializer(-0.1, 0.1))

    def embed(tok_ids):
        # lookup in the shared embedding table
        return layers.embedding(tok_ids,
                                size=[vocab_size, hidden_size],
                                dtype='float32',
                                is_sparse=False,
                                param_attr=emb_attr)

    emb1 = embed(tok_ids1)
    emb2 = embed(tok_ids2)

    lstm_w = fluid.ParamAttr('lstm_w')
    lstm_b = fluid.ParamAttr('lstm_b')

    def encode(emb, length):
        # keep only the second value returned by basic_lstm
        _, encoded, _ = fluid.contrib.layers.basic_lstm(
            emb,
            None,
            None,
            hidden_size,
            sequence_length=length,
            param_attr=lstm_w,
            bias_attr=lstm_b)
        return encoded

    enc_out1 = encode(emb1, len1)
    enc_out2 = encode(emb2, len2)
    enc_out1 = layers.squeeze(enc_out1, [0])
    enc_out2 = layers.squeeze(enc_out2, [0])

    return layers.fc(enc_out1 * enc_out2, 2)
def forward(self):
    """Build the skipgram model and store its loss in ``self.loss``.

    Source nodes are looked up in the 'content' table; positive and
    negative nodes share the 'weight' table. Logits are clipped to
    [-10, 10] before the log-sigmoid terms are computed.
    """
    embed_dim = self.config['embed_dim']
    table_size = [self.num_nodes, embed_dim]

    initrange = 1.0 / embed_dim
    embed_init = fluid.initializer.UniformInitializer(low=-initrange,
                                                      high=initrange)
    weight_init = fluid.initializer.TruncatedNormal(
        scale=1.0 / math.sqrt(embed_dim))

    def lookup(ids, table_name, initializer):
        # embedding lookup in the table called `table_name`
        return fl.embedding(
            input=ids,
            size=[self.num_nodes, self.config['embed_dim']],
            param_attr=fluid.ParamAttr(name=table_name,
                                       initializer=initializer))

    assert table_size == [self.num_nodes, self.config['embed_dim']]
    embed_src = lookup(self.train_inputs, 'content', embed_init)
    weight_pos = lookup(self.train_labels, 'weight', weight_init)
    weight_negs = lookup(self.train_negs, 'weight', weight_init)

    # positive term, logits are [batch_size, 1, 1]
    pos_logits = fl.matmul(embed_src, weight_pos, transpose_y=True)
    pos_score = fl.squeeze(pos_logits, axes=[1])
    pos_score = fl.clip(pos_score, min=-10, max=10)
    pos_score = -self.neg_num * fl.logsigmoid(pos_score)

    # negative term, logits are [batch_size, 1, neg_num]
    neg_logits = fl.matmul(embed_src, weight_negs, transpose_y=True)
    neg_score = fl.squeeze(neg_logits, axes=[1])
    neg_score = fl.clip(neg_score, min=-10, max=10)
    neg_score = -1.0 * fl.logsigmoid(-1.0 * neg_score)
    neg_score = fl.reduce_sum(neg_score, dim=1, keep_dim=True)

    self.loss = fl.reduce_mean(pos_score + neg_score) / self.neg_num / 2
def sag_pool(gw, feature, ratio, graph_id, dataset, name, activation=L.tanh):
    """Implementation of self-attention graph pooling (SAGPool)

    This is an implementation of the paper SELF-ATTENTION GRAPH POOLING
    (https://arxiv.org/pdf/1904.08082.pdf)

    Args:
        gw: Graph wrapper object.
        feature: A tensor with shape (num_nodes, feature_size).
        ratio: The pooling ratio of nodes we want to select.
        graph_id: The graphs that the nodes belong to.
        dataset: To differentiate FRANKENSTEIN dataset and other datasets;
            "FRANKENSTEIN" scores with ``gcn``, anything else with
            ``norm_gcn``.
        name: The name of SAGPool layer.
        activation: The activation function applied to the scores.

    Return:
        new_feature: A tensor with shape (num_nodes, feature_size), and the
            unselected nodes' feature is masked by zero.
        ratio_length: The selected node numbers of each graph.
    """
    score_layer = gcn if dataset == "FRANKENSTEIN" else norm_gcn

    # one attention score per node
    score = score_layer(gw=gw,
                        feature=feature,
                        hidden_size=1,
                        activation=None,
                        norm=gw.node_feat["norm"],
                        name=name)
    score = L.squeeze(score, axes=[])

    perm, ratio_length = topk_pool(gw, score, graph_id, ratio)

    # 0/1 mask with ones at the selected node positions
    keep_mask = L.cast(L.zeros_like(score), dtype="float32")
    ones = L.cast(L.ones_like(perm), dtype="float32")
    keep_mask = L.scatter(keep_mask, perm, ones)

    # zero out unselected nodes, then gate by the activated score
    new_feature = L.elementwise_mul(feature, keep_mask, axis=0)
    gate = activation(score)
    new_feature = L.elementwise_mul(new_feature, gate, axis=0)
    return new_feature, ratio_length
def create_rnn_op(self):
    """Build a StaticRNN whose step applies dot attention over ``emb @ w1``.

    Four float32 data layers ('x', 'emb', 'w1', 'w2') are declared with
    gradients enabled. Every step concatenates the step input with the
    previous memory, projects it with ``w2`` and attends over the
    projected embeddings.

    Returns:
        The output of the StaticRNN.
    """

    def trainable_input(var_name, shape):
        # float32 data layer with gradients enabled, no implicit batch axis
        var = layers.data(shape=shape,
                          dtype='float32',
                          name=var_name,
                          append_batch_size=False)
        var.stop_gradient = False
        return var

    def dot_attention(query, memory):
        scores = layers.matmul(query, memory, transpose_y=True)
        weight = layers.softmax(scores)
        return layers.matmul(weight, memory), weight

    x = trainable_input(
        'x', [self.sent_len, self.batch_size, self.input_dim])
    emb = trainable_input(
        'emb', [self.sent_len, self.batch_size, self.input_dim])
    w1 = trainable_input('w1', [self.input_dim, self.input_dim])
    w2 = trainable_input('w2', [self.input_dim * 2, self.input_dim])

    rnn = layers.StaticRNN()
    y = layers.matmul(emb, w1)
    with rnn.step():
        pre_h = rnn.memory(shape=(self.sent_len, self.input_dim),
                           batch_ref=x,
                           init_value=0.0)
        step_in = rnn.step_input(x)
        joined = layers.concat([step_in, pre_h], 1)
        projected = layers.matmul(joined, w2)
        # attention runs on a query with an extra axis at position 1
        query = layers.unsqueeze(projected, [1])
        attended, _ = dot_attention(query, y)
        new_h = layers.squeeze(attended, [1])
        rnn.update_memory(pre_h, new_h)
        rnn.step_output(new_h)
    return rnn()
def forward(self, x):
    """Compute the upsampled condition.

    Args:
        x (Variable): shape(B, F, T), dtype float32, the condition
            (mel spectrogram here.) (F means the frequency bands). In the
            internal Conv2DTransposes, the frequency dimension is treated
            as `height` dimension instead of `in_channels`.

    Returns:
        Variable: shape(B, F, T * upscale_factor), dtype float32, the
            upsampled condition.
    """
    # add a channel axis at position 1 for the 2-D transposed convolutions
    upsampled = F.unsqueeze(x, axes=[1])
    for conv in self.upsample_convs:
        upsampled = F.leaky_relu(conv(upsampled), alpha=.4)
    return F.squeeze(upsampled, [1])
def forward(self, hidden, encoder_output, encoder_padding_mask):
    """Attend over the projected encoder output with a dot-product score.

    Args:
        hidden: current decoder state, used directly as the query.
        encoder_output: encoder outputs; projected with ``self.input_proj``
            and used both as keys and as values.
        encoder_padding_mask: additive mask for the scores, or None.

    Returns:
        attn_out: ``self.output_proj`` applied to the concatenation of the
            attended vector and ``hidden``.
    """
    projected_memory = self.input_proj(encoder_output)

    query = layers.unsqueeze(hidden, [1])
    scores = layers.matmul(query, projected_memory, transpose_y=True)
    if encoder_padding_mask is not None:
        scores = layers.elementwise_add(scores, encoder_padding_mask)
    weights = layers.softmax(scores)

    attended = layers.matmul(weights, projected_memory)
    attended = layers.squeeze(attended, [1])

    merged = layers.concat([attended, hidden], 1)
    return self.output_proj(merged)
def pop(cls, stack_data, mask=True, in_place=True):
    """Pop the top element of every stack in ``stack_data``.

    Args:
        stack_data (StackData): (data, pos) with shape
            ([batch_size, stack_len], [batch_size, 1])
        mask (bool): whether to mask (multiply by zero) the value returned
            for stacks that were already empty. Defaults to True.
        in_place (bool): when True the new position is assigned into
            ``stack_data.pos`` and ``stack_data`` itself is returned;
            otherwise a new ``StackData`` is returned. Defaults to True.

    Returns:
        (Variable1, Variable2, StackData)
        Variable1: the popped values, dtype=stack_data.data.dtype,
            shape=[-1]
        Variable2: whether the value at each position is valid; False for
            stacks that were empty on entry. dtype=bool
        StackData: the stack after the pop.

    Raises:
        NULL
    """
    data = stack_data.data
    pos = stack_data.pos

    # only non-empty stacks can be popped; empty ones keep their position
    valid_pos = layers.logical_not(cls.empty(stack_data))
    new_pos_delta = layers.cast(valid_pos, dtype=pos.dtype)
    new_pos = layers.elementwise_sub(pos, new_pos_delta)

    # shape = [batch_size]
    output = nn_utils.batch_gather(data, new_pos)

    # mask the value returned for empty stacks
    if mask:
        # shape = [batch_size, 1]
        if data.dtype != pos.dtype:
            mask_tag = layers.cast(new_pos_delta, dtype=data.dtype)
        else:
            mask_tag = new_pos_delta
        mask_tag = layers.squeeze(mask_tag, [1])
        output = layers.elementwise_mul(output, mask_tag)

    # reset the popped slot to 0
    updates = layers.zeros_like(output)
    new_data = nn_utils.batch_scatter(data,
                                      new_pos,
                                      updates,
                                      overwrite=True,
                                      in_place=in_place)

    if not in_place:
        return output, valid_pos, StackData(new_data, new_pos)

    layers.assign(new_pos, pos)
    return output, valid_pos, stack_data
def decode(args, s_arc, s_rel, mask):
    """Decode arc and relation predictions from their scores.

    Arcs are the argmax of ``s_arc``. When ``args.tree`` is set, sequences
    whose arcs fail ``utils.istree`` are re-decoded with ``utils.eisner``.
    Relations are the argmax of ``s_rel``, sampled at the predicted arcs.

    Args:
        args: options; ``args.tree`` enables the tree check and repair.
        s_arc: arc scores.
        s_rel: relation scores.
        mask: mask of valid tokens; its row sums give the sequence lengths.

    Returns:
        tuple: ``(arc_preds, rel_preds)`` as framework variables.
    """
    mask = mask.numpy()
    arc_preds = layers.argmax(s_arc, -1).numpy()

    # The per-sequence tree check only matters when repair is enabled, so
    # skip it entirely otherwise.
    if args.tree:
        lens = np.sum(mask, -1)
        bad = [
            not utils.istree(seq[:i + 1]) for i, seq in zip(lens, arc_preds)
        ]
        if any(bad):
            arc_preds[bad] = utils.eisner(s_arc.numpy()[bad], mask[bad])

    arc_preds = dygraph.to_variable(arc_preds, zero_copy=False)
    rel_preds = layers.argmax(s_rel, axis=-1)
    # pick, for every token, the relation that belongs to its predicted arc
    rel_preds = nn.index_sample(rel_preds, layers.unsqueeze(arc_preds, -1))
    rel_preds = layers.squeeze(rel_preds, axes=[-1])
    return arc_preds, rel_preds