def forward(self, x, speaker_embed=None):
    """Convert a mel spectrogram (or decoder hidden states) to a linear
    spectrogram.

    Args:
        x (Variable): shape(B, T_mel, C_in), dtype float32, converter
            inputs, where C_in is the input channel of the converter.
            C_in = C_mel when the mel spectrogram is the converter input,
            and C_in = C_dec // r when decoder states are the input.
        speaker_embed (Variable, optional): shape(B, C_sp), dtype float32,
            speaker embedding, where C_sp is the speaker embedding size.

    Returns:
        out (Variable): shape(B, T_lin, C_lin), the output linear
            spectrogram, where C_lin is the channel of the linear
            spectrogram and T_lin its number of time steps.
            T_lin = time_upsampling * T_mel, which depends on the
            time_upsampling of the converter.
    """
    # The convolutions are applied on the (B, C, T) layout.
    hidden = self.first_conv_proj(F.transpose(x, [0, 2, 1]))

    if speaker_embed is not None:
        speaker_embed = F.dropout(
            speaker_embed,
            self.dropout,
            dropout_implementation="upscale_in_train")

    for block in chain(self.upsampling_convolutions, self.convolutions):
        # Only Conv1DGLU layers receive the speaker embedding.
        if isinstance(block, Conv1DGLU):
            hidden = block(hidden, speaker_embed)
        else:
            hidden = block(hidden)

    # Final projection, then back to the (B, T, C) layout.
    return F.transpose(self.last_conv_proj(hidden), [0, 2, 1])
def forward(self, encoder_output):
    """Predict the duration of each character.

    Args:
        encoder_output (Variable): shape(B, T, C), dtype float32, the
            encoder output.

    Returns:
        out (Variable): the output of the duration predictor, with the
            trailing axis of the final linear projection squeezed away
            (presumably shape(B, T) -- confirm that `self.linear` projects
            to a single channel).
    """

    def conv_norm_block(inputs, conv, layer_norm):
        # The convolution runs on (B, C, T); layer norm, ReLU and dropout
        # run on (B, T, C).
        hidden = conv(layers.transpose(inputs, [0, 2, 1]))
        hidden = layers.transpose(hidden, [0, 2, 1])
        return layers.dropout(
            layers.relu(layer_norm(hidden)),
            self.dropout,
            dropout_implementation='upscale_in_train')

    out = conv_norm_block(encoder_output, self.conv1, self.layer_norm1)
    out = conv_norm_block(out, self.conv2, self.layer_norm2)

    # ReLU on the projection output keeps the result non-negative.
    out = layers.relu(self.linear(out))
    return layers.squeeze(out, axes=[-1])
def forward(self, input):
    """Run the network and return its output, the CAM logits and a heatmap.

    Features from `self.model` are pooled globally (average and max), each
    pooled vector is classified by its own fc layer, and the weights of
    those fc layers are used to re-weight the feature channels.

    Args:
        input (Variable): the input fed to `self.model`
            (presumably an NCHW image batch -- confirm with the caller).

    Returns:
        tuple: (out, cam_logit, heatmap), where `cam_logit` concatenates
            the two pooled logits along axis 1 and `heatmap` is the sum
            over axis 1 of the fused features (axis kept).
    """
    features = self.model(input)
    batch = features.shape[0]

    def attention_branch(pool_type, fc):
        pooled = adaptive_pool2d(features, 1, pool_type=pool_type)
        logit = fc(reshape(pooled, shape=[batch, -1]))
        # First parameter of the fc layer, transposed so it can be
        # broadcast over the two trailing axes of the features.
        weight = transpose(list(fc.parameters())[0], perm=[1, 0])
        weighted = features * unsqueeze(unsqueeze(weight, 2), 3)
        return logit, weighted

    gap_logit, gap = attention_branch('avg', self.gap_fc)
    gmp_logit, gmp = attention_branch('max', self.gmp_fc)

    cam_logit = concat([gap_logit, gmp_logit], 1)
    fused = self.leaky_relu(self.conv1x1(concat([gap, gmp], 1)))

    heatmap = reduce_sum(fused, dim=1, keep_dim=True)

    out = self.conv(self.pad(fused))
    return out, cam_logit, heatmap
def forward(self, input):
    """Compute the feed forward network result.

    Args:
        input (Variable): shape(B, T, C), dtype float32, the input value.

    Returns:
        output (Variable): shape(B, T, C), the result after FFN.
    """
    # w_1 / w_2 are applied on the transposed (B, C, T) layout.
    hidden = layers.transpose(input, [0, 2, 1])
    hidden = self.w_2(layers.relu(self.w_1(hidden)))

    hidden = layers.dropout(
        hidden, self.dropout, dropout_implementation='upscale_in_train')
    hidden = layers.transpose(hidden, [0, 2, 1])

    # Residual connection followed by layer normalization.
    return self.layer_norm(hidden + input)
def detect(self, batch_idx, conf_preds, decoded_boxes, mask_data):
    """Perform nms for only the max scoring class that isn't background
    (class 0).

    Args:
        batch_idx: index of the image inside the batch.
        conf_preds: class confidences, indexed as
            [batch_idx, class, prior]; class 0 is skipped.
        decoded_boxes: decoded boxes, gathered along axis 0 with the kept
            prior indices.
        mask_data: per-image mask data; `mask_data[batch_idx]` is gathered
            with the same indices.

    Returns:
        The result of `self.fast_nms(boxes, scores, masks)`.
    """
    # All boxes are decoded before score filtering; decoding only the
    # boxes that survive filtering could be considered instead.
    cur_scores = conf_preds[batch_idx, 1:, :]
    conf_scores = P.reduce_max(cur_scores, dim=0)

    # Workaround (author's note): with the GPU build of paddlepaddle 1.6.2,
    # using an empty `keep` inside gather() fails with
    # "cudaGetLastError invalid configuration argument errno: 9", while the
    # CPU build runs fine. To avoid it, one extra index (`keep_extra`, the
    # first prior scoring below the threshold) is appended to `keep`.
    above = P.where(conf_scores > self.conf_thresh)
    keep_extra = P.where(conf_scores < self.conf_thresh)[:1]
    keep = P.concat([above, keep_extra], axis=0)

    scores = P.transpose(
        P.gather(P.transpose(cur_scores, perm=[1, 0]), keep), perm=[1, 0])
    boxes = P.gather(decoded_boxes, keep)
    masks = P.gather(mask_data[batch_idx], keep)

    # Because of `keep_extra`, `keep` is meant to always hold at least one
    # prediction. Once the issue above is fixed upstream, delete the
    # `keep_extra` code and enable the code below instead (detecting an
    # empty `keep` directly is too hard).
    #
    # No box might be kept, so append a lowest-scoring dummy box to let
    # fast_nms() proceed:
    # extra_box = P.fill_constant((1, 4), 'float32', value=-1.0)
    # extra_score = P.fill_constant((P.shape(cur_scores)[0], 1), 'float32', value=-1.0)
    # extra_mask = P.fill_constant((1, P.shape(mask_data)[2]), 'float32', value=-1.0)
    # boxes = P.concat([boxes, extra_box], axis=0)
    # scores = P.concat([scores, extra_score], axis=1)
    # masks = P.concat([masks, extra_mask], axis=0)

    return self.fast_nms(boxes, scores, masks)
def compute_l2_normalized_weight(v, g, dim):
    """Compute a weight-normalized parameter: `v` L2-normalized, scaled by `g`.

    Args:
        v (Variable): the direction parameter.
        g (Variable): the magnitude parameter, multiplied onto the
            normalized `v` along axis `dim` (presumably one value per
            slice along `dim` -- confirm with the caller).
        dim (int or None): the axis that is kept; every slice along `dim`
            is normalized independently. When None, `v` is normalized by
            the L2 norm of the whole tensor.

    Returns:
        Variable: the re-parameterized weight, with the same shape as `v`.
    """
    shape = v.shape
    ndim = len(shape)

    if dim is None:
        # Divide by the L2 norm itself (square root of the sum of
        # squares), not by the squared norm.
        v_normalized = v / (F.sqrt(F.reduce_sum(F.square(v))) + 1e-12)
    elif dim == 0:
        param_matrix = F.reshape(v, (shape[0], -1))
        v_normalized = F.l2_normalize(param_matrix, axis=1)
        v_normalized = F.reshape(v_normalized, shape)
    elif dim == -1 or dim == ndim - 1:
        param_matrix = F.reshape(v, (-1, shape[-1]))
        v_normalized = F.l2_normalize(param_matrix, axis=0)
        v_normalized = F.reshape(v_normalized, shape)
    else:
        # Move `dim` to the front by swapping it with axis 0.
        perm = list(range(ndim))
        perm[0] = dim
        perm[dim] = 0
        transposed_param = F.transpose(v, perm)
        transposed_shape = transposed_param.shape
        param_matrix = F.reshape(transposed_param,
                                 (transposed_shape[0], -1))
        v_normalized = F.l2_normalize(param_matrix, axis=1)
        # Restore the transposed N-D shape *before* undoing the swap: the
        # permutation has `ndim` entries and cannot be applied to the 2-D
        # matrix. A swap is its own inverse, so `perm` is reused.
        v_normalized = F.reshape(v_normalized, transposed_shape)
        v_normalized = F.transpose(v_normalized, perm)

    # elementwise_mul needs an integer axis; -1 is used when dim is None.
    weight = F.elementwise_mul(
        v_normalized, g, axis=dim if dim is not None else -1)
    return weight
def forward(self, queries, keys, values, attn_bias, cache=None):
    """Multi-head scaled dot-product attention with an optional k/v cache.

    Args:
        queries (Variable): the query input.
        keys (Variable or None): the key input; `queries` is used when None.
        values (Variable or None): the value input; `keys` is used when
            None.
        attn_bias (Variable or None): added to the attention logits before
            the softmax when given.
        cache (dict, optional): holds "k" and "v" from previous steps. The
            new keys/values are appended along axis 2 and the dict is
            updated in place.

    Returns:
        Variable: the attention output after the final projection.
    """
    keys = queries if keys is None else keys
    values = keys if values is None else values

    q = self.q_fc(queries)
    k = self.k_fc(keys)
    v = self.v_fc(values)

    def split_heads(tensor, head_dim):
        # [.., .., n_head * head_dim] -> [.., n_head, .., head_dim]
        tensor = layers.reshape(
            x=tensor, shape=[0, 0, self.n_head, head_dim])
        return layers.transpose(x=tensor, perm=[0, 2, 1, 3])

    q = split_heads(q, self.d_key)
    k = split_heads(k, self.d_key)
    v = split_heads(v, self.d_value)

    if cache is not None:
        k = layers.concat([cache["k"], k], axis=2)
        v = layers.concat([cache["v"], v], axis=2)
        cache["k"], cache["v"] = k, v

    # Scaled dot product; the scale is derived from d_model here.
    product = layers.matmul(
        x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5)
    if attn_bias is not None:
        product += attn_bias
    weights = layers.softmax(product)
    if self.dropout_rate:
        weights = layers.dropout(weights, dropout_prob=self.dropout_rate)

    # Merge the heads back into the last axis.
    out = layers.transpose(layers.matmul(weights, v), perm=[0, 2, 1, 3])
    out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

    return self.proj_fc(out)
def forward(self, input):
    """Compute the mel spectrum.

    Args:
        input (Variable): shape(B, T, C), dtype float32, the result of
            mel linear projection.

    Returns:
        output (Variable): shape(B, T, C), the result after postconvnet.
    """
    hidden = layers.transpose(input, [0, 2, 1])
    # Every convolution output is cut back to this many steps on its
    # last axis.
    num_steps = hidden.shape[-1]
    last = self.num_conv - 1

    for idx in range(last):
        conv_out = self.conv_list[idx](hidden)[:, :, :num_steps]
        normed = self.batch_norm_list[idx](conv_out)
        hidden = layers.dropout(
            layers.tanh(normed),
            self.dropout,
            dropout_implementation='upscale_in_train')

    # The last convolution has no tanh; batch norm + dropout are optional.
    hidden = self.conv_list[last](hidden)[:, :, :num_steps]
    if self.batchnorm_last:
        hidden = layers.dropout(
            self.batch_norm_list[last](hidden),
            self.dropout,
            dropout_implementation='upscale_in_train')

    return layers.transpose(hidden, [0, 2, 1])
def _weight_norm(v, g, dim):
    """Compute the weight-normalized parameter from `v` and `g`.

    `v` is L2-normalized (over the whole tensor when `dim` is None,
    otherwise independently for every slice along `dim`) and multiplied by
    `g` along axis `dim` (axis -1 when `dim` is None).

    Args:
        v (Variable): the direction parameter.
        g (Variable): the magnitude parameter.
        dim (int or None): the axis that is kept while normalizing.

    Returns:
        Variable: the weight, with the same shape as `v`.
    """
    shape = v.shape
    rank = len(shape)

    if dim is None:
        mul_axis = -1
        norm = F.sqrt(F.reduce_sum(F.square(v)))
        unit_v = v / (norm + 1e-12)
    else:
        mul_axis = dim
        if dim == 0:
            flat = F.reshape(v, (shape[0], -1))
            unit_v = F.reshape(F.l2_normalize(flat, axis=1), shape)
        elif dim == -1 or dim == rank - 1:
            flat = F.reshape(v, (-1, shape[-1]))
            unit_v = F.reshape(F.l2_normalize(flat, axis=0), shape)
        else:
            # Swap `dim` with axis 0, normalize each leading slice, then
            # undo the swap (a swap is its own inverse).
            swap = list(range(rank))
            swap[0], swap[dim] = dim, 0
            swapped = F.transpose(v, swap)
            swapped_shape = swapped.shape
            flat = F.reshape(swapped, (swapped.shape[0], -1))
            unit_v = F.l2_normalize(flat, axis=1)
            unit_v = F.reshape(unit_v, swapped_shape)
            unit_v = F.transpose(unit_v, swap)

    return F.elementwise_mul(unit_v, g, axis=mul_axis)
def forward(self, x, condition=None):
    """Compute the output distribution (represented by its parameters).

    Args:
        x (Variable): shape(B, T), dtype float32, the input waveform.
        condition (Variable, optional): shape(B, C_cond, T), dtype
            float32, the upsampled condition. Defaults to None.

    Returns:
        Variable: shape(B, T, C_output), dtype float32, the parameter of
            the output distributions.
    """
    if self.loss_type == "softmax":
        # Clip, quantize, then embed the quantized samples.
        clipped = F.clip(x, min=-1., max=0.99999)
        embedded = self.embed(quantize(clipped, self.output_dim))  # (B, T, C)
    else:
        # Embed the raw samples: (B, T) -> (B, T, 1) -> (B, T, C).
        embedded = self.embed(F.unsqueeze(x, axes=[-1]))

    hidden = F.transpose(embedded, perm=[0, 2, 1])  # (B, C, T)

    # Residual & skip-connection stack, then the output projections.
    z = F.transpose(self.resnet(hidden, condition), [0, 2, 1])
    z = F.relu(self.proj2(F.relu(self.proj1(z))))

    return self.proj3(z)
def forward(self, seq):
    """Apply `self.conv2d` to a 3-D input through a temporary unit axis.

    The input is transposed to swap its last two axes, given a trailing
    axis, convolved, and then restored to the original axis order.

    Args:
        seq (Variable): a 3-D input (presumably (batch, time, channels) --
            confirm with the caller).

    Returns:
        Variable: the convolved sequence, with the last two axes in the
            same order as the input.
    """
    expanded = layers.unsqueeze(layers.transpose(seq, [0, 2, 1]), -1)
    convolved = layers.squeeze(self.conv2d(expanded), [-1])
    return layers.transpose(convolved, [0, 2, 1])
def forward(self, queries, keys, values, attn_bias, past_cache):
    """Multi-head attention that also returns this call's projected k/v.

    Args:
        queries, keys, values (Variable): 3-D inputs (asserted below).
        attn_bias (Variable or None): added to the attention scores before
            the softmax when given.
        past_cache (tuple or None): `(cached_k, cached_v)` from earlier
            steps; when given they are prepended to the new keys/values
            along axis 1.

    Returns:
        tuple: `(out, cache)`, where `cache` is the `(k, v)` pair projected
            from this call's inputs, taken before any concatenation with
            `past_cache`.
    """
    assert len(queries.shape) == len(keys.shape) == len(values.shape) == 3

    q = self.q(queries)
    k = self.k(keys)
    v = self.v(values)

    # The cache returned to the caller holds only the new projections.
    cache = (k, v)
    if past_cache is not None:
        cached_k, cached_v = past_cache
        k = L.concat([cached_k, k], 1)
        v = L.concat([cached_v, v], 1)

    def split_heads(tensor):
        # -> [batch, head, seq, dim]
        head_dim = tensor.shape[-1] // self.n_head
        return L.transpose(
            L.reshape(tensor, [0, 0, self.n_head, head_dim]), [0, 2, 1, 3])

    q = split_heads(q)
    k = split_heads(k)
    v = split_heads(v)

    q = L.scale(q, scale=self.d_key ** -0.5)
    score = L.matmul(q, k, transpose_y=True)
    if attn_bias is not None:
        score += attn_bias
    score = self.dropout(L.softmax(score, use_cudnn=True))

    # Merge the heads back into the last axis.
    out = L.transpose(L.matmul(score, v), [0, 2, 1, 3])
    out = L.reshape(out, [0, 0, out.shape[2] * out.shape[3]])

    return self.o(out), cache
def add_input(self, x, condition=None):
    """Compute the output distribution (represented by its parameters)
    for a single step.

    It works similarly to the `forward` method, but in a
    `step-in-step-out` fashion.

    Args:
        x (Variable): shape(B, T=1), dtype float32, a step of the input
            waveform.
        condition (Variable, optional): shape(B, C_cond, T=1), dtype
            float32, a step of the upsampled condition. Defaults to None.

    Returns:
        Variable: shape(B, T=1, C_output), dtype float32, the parameter of
            the output distributions.
    """
    if self.loss_type == "softmax":
        # Clip, quantize, then embed the quantized step.
        clipped = F.clip(x, min=-1., max=0.99999)
        embedded = self.embed(quantize(clipped, self.output_dim))  # (B, T, C), T=1
    else:
        # Embed the raw step: (B, T) -> (B, T, 1) -> (B, T, C), T=1.
        embedded = self.embed(F.unsqueeze(x, axes=[-1]))

    hidden = F.transpose(embedded, perm=[0, 2, 1])

    # Residual & skip-connection stack in step mode, then the projections.
    z = F.transpose(self.resnet.add_input(hidden, condition), [0, 2, 1])
    z = F.relu(self.proj2(F.relu(self.proj1(z))))  # (B, T, C)

    return self.proj3(z)
def _prepare_qkv(self, queries, keys, values, cache=None):
    """Project the inputs to per-head q, k, v, honouring the cache.

    Args:
        queries (Variable): the query input.
        keys (Variable or None): the key input. None means self-attention
            (`queries` is used for keys and values); otherwise this is
            cross-attention and k/v are treated as static.
        values (Variable or None): the value input.
        cache (dict, optional): for cross-attention, "static_k"/"static_v"
            are read when present and written otherwise; for
            self-attention, "k"/"v" are extended along axis 2 and updated
            in place.

    Returns:
        tuple: `(q, k, v)`, each with the heads split out on axis 1.
    """
    static_kv = keys is not None  # cross-attention
    if not static_kv:  # self-attention
        keys, values = queries, queries

    def split_heads(tensor, head_dim):
        tensor = layers.reshape(
            x=tensor, shape=[0, 0, self.n_head, head_dim])
        return layers.transpose(x=tensor, perm=[0, 2, 1, 3])

    q = split_heads(self.q_fc(queries), self.d_key)

    if cache is not None and static_kv and "static_k" in cache:
        # Encoder-decoder attention at inference time, already cached.
        k, v = cache["static_k"], cache["static_v"]
    else:
        k = self.k_fc(keys)
        v = self.v_fc(values)
        k = split_heads(k, self.d_key)
        v = split_heads(v, self.d_value)

    if cache is not None:
        if static_kv and "static_k" not in cache:
            # Encoder-decoder attention at inference time, not cached yet.
            cache["static_k"], cache["static_v"] = k, v
        elif not static_kv:
            # Decoder self-attention at inference time.
            k = layers.concat([cache["k"], k], axis=2)
            v = layers.concat([cache["v"], v], axis=2)
            cache["k"], cache["v"] = k, v

    return q, k, v
def forward(self, src, mask, query_embed, pos_embed):
    """Run the encoder and decoder over a flattened feature map.

    Args:
        src (Variable): feature map, [bs, c, h, w].
        mask (Variable): reshaped to [bs, h * w] and passed to both the
            encoder and the decoder.
        query_embed (Variable): [num_queries, c_q]; tiled over the batch.
        pos_embed (Variable): positional embedding, flattened the same way
            as `src`.

    Returns:
        tuple: `(hs, memory, encoder_attn_weights, decoder_attn_weights)`;
            `memory` is reshaped back to [bs, c, h, w]. Per the original
            notes, `hs` is [num_inter, bs, num_queries, c_q].
    """
    bs, c, h, w = src.shape

    def flatten_spatial(feat, channels):
        # [bs, channels, h, w] -> [bs, channels, h * w] -> [bs, h * w, channels]
        return L.transpose(L.reshape(feat, (bs, channels, -1)), (0, 2, 1))

    src = flatten_spatial(src, c)
    pos_embed = flatten_spatial(pos_embed, pos_embed.shape[1])

    # [num_queries, c_q] -> [1, num_queries, c_q] -> [bs, num_queries, c_q]
    query_embed = L.expand(L.unsqueeze(query_embed, [0]), (bs, 1, 1))
    mask = L.reshape(mask, (bs, -1))  # [bs, h * w]

    # The decoder starts from an all-zero target.
    tgt = L.zeros_like(query_embed)  # [bs, num_queries, c_q]

    memory, encoder_attn_weights = self.encoder(
        src, src_mask=mask, pos=pos_embed)  # [bs, h * w, c]
    hs, decoder_attn_weights = self.decoder(
        tgt,
        memory,
        memory_mask=mask,
        pos=pos_embed,
        query_pos=query_embed)

    # [bs, h * w, c] -> [bs, c, h * w] -> [bs, c, h, w]
    memory = L.reshape(L.transpose(memory, (0, 2, 1)), (bs, c, h, w))
    return hs, memory, encoder_attn_weights, decoder_attn_weights
def _matrix_nms(bboxes, cate_labels, cate_scores, kernel='gaussian', sigma=2.0):
    """Matrix NMS for multi-class bboxes.

    Args:
        bboxes (Tensor): shape (n, 4)
        cate_labels (Tensor): shape (n), mask labels in descending order
        cate_scores (Tensor): shape (n), mask scores in descending order
        kernel (str): 'linear' or 'gaussian'
        sigma (float): std in gaussian method

    Returns:
        Tensor: cate_scores_update, tensors of shape (n). An empty list is
            returned when there are no samples.

    Raises:
        NotImplementedError: if `kernel` is neither 'gaussian' nor 'linear'.
    """
    n_samples = len(cate_labels)
    if n_samples == 0:
        return []

    # n x n IoU matrix between every pair of boxes; only the strict upper
    # triangle is kept, i.e. entry (i, j) with i < j pairs box j with a
    # higher-scoring box i.
    iou_matrix = jaccard(bboxes, bboxes)  # shape: [n_samples, n_samples]
    iou_matrix = paddle.triu(iou_matrix, diagonal=1)

    # Label-specific matrix: entry (i, j) tells whether boxes i and j have
    # the same class id. Only boxes of the same class suppress each other.
    cate_labels_x = L.expand(L.reshape(cate_labels, (1, -1)), [n_samples, 1])  # shape: [n_samples, n_samples]
    d = cate_labels_x - L.transpose(cate_labels_x, [1, 0])
    # 0 for the same class, > 0 otherwise. The original author notes that
    # comparing with `== 0` did not work, hence `< 1`.
    d = L.pow(d, 2)
    label_matrix = paddle.triu(L.cast(d < 1, 'float32'), diagonal=1)  # shape: [n_samples, n_samples]

    # IoU decay: IoU between same-class pairs (0 for different classes),
    # upper triangle only. Entry (i, j) is the IoU of boxes i and j.
    decay_iou = iou_matrix * label_matrix  # shape: [n_samples, n_samples]

    # IoU compensation: column-wise maximum, i.e. for every box the
    # highest IoU with a higher-scoring box of the same class ...
    compensate_iou = L.reduce_max(decay_iou, [0, ])  # shape: [n_samples, ]
    # ... laid out so that row i repeats the value of box i n_samples
    # times.
    compensate_iou = L.transpose(
        L.expand(L.reshape(compensate_iou, (1, -1)), [n_samples, 1]),
        [1, 0])  # shape: [n_samples, n_samples]

    # Matrix NMS. In column j, the smallest entry of `decay_matrix`
    # belongs to the box i that suppresses box j the most: the larger
    # decay_iou is (and the less box i was itself suppressed), the smaller
    # the decay. Entries of non-suppressing rows are always >= 1, so they
    # never win the minimum.
    if kernel == 'gaussian':
        decay_matrix = L.exp(-1 * sigma * (decay_iou ** 2))
        compensate_matrix = L.exp(-1 * sigma * (compensate_iou ** 2))
        decay_matrix = decay_matrix / compensate_matrix
    elif kernel == 'linear':
        decay_matrix = (1 - decay_iou) / (1 - compensate_iou)
    else:
        raise NotImplementedError

    # The decay coefficient is the column-wise *minimum* for both kernels.
    # Summing the column instead would add up the >= 1 entries of the
    # non-suppressing rows and inflate the scores rather than decay them.
    decay_coefficient = L.reduce_min(decay_matrix, [0, ])

    # Update the scores.
    cate_scores_update = cate_scores * decay_coefficient
    return cate_scores_update
def cal_kv(self, keys, values):
    """Project keys and values and split them into attention heads.

    Args:
        keys (Variable): the key input, fed to `self.k_fc`.
        values (Variable): the value input, fed to `self.v_fc`.

    Returns:
        tuple: `(k, v)`; the last axis of each projection is split into
            (n_head, d_key) / (n_head, d_value) and the head axis is moved
            to position 1.
    """
    projected_k = self.k_fc(keys)
    projected_v = self.v_fc(values)

    def split_heads(tensor, head_dim):
        tensor = layers.reshape(
            x=tensor, shape=[0, 0, self.n_head, head_dim])
        return layers.transpose(x=tensor, perm=[0, 2, 1, 3])

    return (split_heads(projected_k, self.d_key),
            split_heads(projected_v, self.d_value))
def forward(self, key, value, query_input, mask=None, query_mask=None):
    """Compute multi-head attention.

    Args:
        key (Variable): shape(B, T, C), dtype float32, the input key of
            attention.
        value (Variable): shape(B, T, C), dtype float32, the input value
            of attention.
        query_input (Variable): shape(B, T, C), dtype float32, the input
            query of attention.
        mask (Variable, optional): shape(B, T_query, T_key), dtype
            float32, the mask of key. Defaults to None.
        query_mask (Variable, optional): shape(B, T_query, T_key), dtype
            float32, the mask of query. Defaults to None.

    Returns:
        result (Variable): shape(B, T, C), the result of multi-head
            attention.
        attention (Variable): shape(num_head * B, T, C), the attention of
            key and query.
    """
    batch_size = key.shape[0]
    seq_len_key = key.shape[1]
    seq_len_query = query_input.shape[1]

    def to_heads(projected, seq_len, depth):
        # (B, T, num_head * depth) -> (num_head * B, T, depth), with the
        # head index varying slowest on the leading axis.
        split = layers.reshape(
            projected, [batch_size, seq_len, self.num_head, depth])
        return layers.reshape(
            layers.transpose(split, [2, 0, 1, 3]), [-1, seq_len, depth])

    key_proj = self.key(key)
    value_proj = self.value(value)
    query_proj = self.query(query_input)

    key = to_heads(key_proj, seq_len_key, self.d_k)
    value = to_heads(value_proj, seq_len_key, self.d_k)
    query = to_heads(query_proj, seq_len_query, self.d_q)

    result, attention = self.scal_attn(
        key, value, query, mask=mask, query_mask=query_mask)

    # Concatenate the results of all heads:
    # (num_head * B, T, d_q) -> (B, T, num_head * d_q).
    result = layers.reshape(
        result, [self.num_head, batch_size, seq_len_query, self.d_q])
    result = layers.reshape(
        layers.transpose(result, [1, 2, 0, 3]),
        [batch_size, seq_len_query, -1])

    if self.is_concat:
        result = layers.concat([query_input, result], axis=-1)

    result = layers.dropout(
        self.fc(result),
        self.dropout,
        dropout_implementation='upscale_in_train')

    # Residual connection followed by layer normalization.
    result = self.layer_norm(result + query_input)
    return result, attention
def simple_rnn(rnn_input, init_hidden, hidden_size, kernel_param_attr=None, recurrent_param_attr=None, bias_attr=None, act='relu', sequence_length=None, name='simple_rnn'):
    """Run a simple RNN over a batch-major input with a PaddingRNN.

    Args:
        rnn_input (Variable): batch-major input; it is transposed to
            (sequence, batch, hidden) before the recurrence.
        init_hidden (Variable or None): initial hidden state. When None, a
            memory of shape [-1, hidden_size] is created with `rnn_input`
            as the batch reference.
        hidden_size (int): size of the hidden state.
        kernel_param_attr, recurrent_param_attr, bias_attr: parameter
            attributes forwarded to `SimpleRNN_unit`.
        act (str): activation forwarded to `SimpleRNN_unit`.
        sequence_length (Variable, optional): when given, a float32
            sequence mask is built and registered as a step input.
        name (str): unused by this function.

    Returns:
        tuple: `(rnn_out, last_hidden)`. `rnn_out` is the raw output of
            the PaddingRNN; `last_hidden` is the last time step reshaped
            to [1, -1, hidden_size] and then transposed with [1, 0, 2].
    """
    # Transpose (sequence x batch x hidden)
    rnn_input = layers.transpose(rnn_input, [1, 0, 2])

    # Generate the mask. Arguments are tested with `is not None`, not by
    # truthiness, so that a tensor is never evaluated as a boolean.
    mask = None
    if sequence_length is not None:
        max_seq_len = layers.shape(rnn_input)[0]
        mask = layers.sequence_mask(
            sequence_length, maxlen=max_seq_len, dtype='float32')
        mask = layers.transpose(mask, [1, 0])

    # Init
    rnn_unit = SimpleRNN_unit(rnn_input, hidden_size, kernel_param_attr,
                              recurrent_param_attr, bias_attr, act)

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(rnn_input)

        if mask is not None:
            # NOTE(review): the mask is registered as a step input but is
            # not consumed by the step computation below -- confirm
            # whether padded steps were meant to be masked.
            rnn.step_input(mask)

        if init_hidden is not None:
            pre_hidden = rnn.memory(init=init_hidden)
        else:
            pre_hidden = rnn.memory(
                batch_ref=rnn_input, shape=[-1, hidden_size])

        last_hidden = rnn_unit(step_in, pre_hidden)

        rnn.update_memory(pre_hidden, last_hidden)
        rnn.step_output(last_hidden)

    rnn_out = rnn()

    last_hidden = rnn_out[-1]
    last_hidden = layers.reshape(last_hidden, shape=[1, -1, hidden_size])
    last_hidden = layers.transpose(last_hidden, [1, 0, 2])

    # NOTE(review): `rnn_out` is returned exactly as produced by the
    # PaddingRNN. The previous code also built a [1, 0, 2] transpose of it
    # that was never returned; returning that instead would change the
    # layout callers receive, so the return value is left unchanged --
    # confirm which layout callers expect.
    return rnn_out, last_hidden
def matrix_nms(seg_masks, cate_labels, cate_scores, kernel='gaussian', sigma=2.0, sum_masks=None):
    """Matrix NMS for multi-class masks.

    Args:
        seg_masks (Tensor): shape (n, h, w), masks made of 0s and 1s
        cate_labels (Tensor): shape (n), mask labels in descending order
        cate_scores (Tensor): shape (n), mask scores in descending order
        kernel (str): 'linear' or 'gaussian'
        sigma (float): std in gaussian method
        sum_masks (Tensor, optional): shape (n, ), the area of each of the
            n masks. Computed from `seg_masks` when None.

    Returns:
        Tensor: cate_scores_update, tensors of shape (n)

    Raises:
        NotImplementedError: if `kernel` is neither 'gaussian' nor 'linear'.
    """
    n_samples = L.shape(cate_labels)[0]  # number of objects
    seg_masks = L.reshape(seg_masks, (n_samples, -1))  # [n, h*w]

    if sum_masks is None:
        # The area of a mask is the sum of its entries.
        sum_masks = L.reduce_sum(seg_masks, dim=1)  # [n, ]

    # inter: the masks times their own transpose gives the pairwise
    # intersection areas.
    inter_matrix = L.matmul(seg_masks, seg_masks, transpose_y=True)  # [n, n]

    # union: sum_masks repeated over n rows.
    sum_masks_x = L.expand(L.reshape(sum_masks, (1, -1)), [n_samples, 1])  # [n, n]

    # iou.
    iou_matrix = inter_matrix / (
        sum_masks_x + L.transpose(sum_masks_x, [1, 0]) - inter_matrix)

    # Strict upper-triangle selector: rows[i][j] = j and cols[i][j] = i,
    # so `rows > cols` holds exactly where j > i.
    rows = L.range(0, n_samples, 1, 'int32')
    cols = L.range(0, n_samples, 1, 'int32')
    rows = L.expand(L.reshape(rows, (1, -1)), [n_samples, 1])
    cols = L.expand(L.reshape(cols, (-1, 1)), [1, n_samples])
    tri_mask = L.cast(rows > cols, 'float32')
    iou_matrix = tri_mask * iou_matrix  # [n, n], upper triangle only

    # label_specific matrix: cate_labels repeated over n rows, compared
    # with its transpose.
    cate_labels_x = L.expand(L.reshape(cate_labels, (1, -1)), [n_samples, 1])  # [n, n]
    label_matrix = L.cast(
        L.equal(cate_labels_x, L.transpose(cate_labels_x, [1, 0])),
        'float32')
    label_matrix = tri_mask * label_matrix  # [n, n], upper triangle only

    # IoU decay: IoU restricted to same-class pairs.
    decay_iou = iou_matrix * label_matrix

    # IoU compensation: column-wise maximum, laid out so that row i
    # repeats the value of object i.
    compensate_iou = L.reduce_max(decay_iou, dim=0)
    compensate_iou = L.expand(L.reshape(compensate_iou, (1, -1)), [n_samples, 1])  # [n, n]
    compensate_iou = L.transpose(compensate_iou, [1, 0])  # [n, n]

    # matrix nms
    if kernel == 'gaussian':
        decay_matrix = L.exp(-1 * sigma * (decay_iou ** 2))
        compensate_matrix = L.exp(-1 * sigma * (compensate_iou ** 2))
        decay_coefficient = L.reduce_min((decay_matrix / compensate_matrix), dim=0)
    elif kernel == 'linear':
        decay_matrix = (1 - decay_iou) / (1 - compensate_iou)
        decay_coefficient = L.reduce_min(decay_matrix, dim=0)
    else:
        raise NotImplementedError

    # update the score.
    cate_scores_update = cate_scores * decay_coefficient
    return cate_scores_update
def rnn_decoder(gru_unit, cue_gru_unit, input, input_size, hidden_size, num_layers, memory, memory_mask, knowledge, output_size, init_hidden=None, mask=None, dropout=0.0, batch_first=True, name="decoder"):
    """Run the step-wise RNN decoder and return the softmax over outputs.

    Args:
        gru_unit, cue_gru_unit: recurrent units forwarded to
            `decoder_step`.
        input (Variable): decoder input, embedded with `get_embedding`.
        input_size (int): forwarded to `get_embedding` and `decoder_step`.
        hidden_size (int): hidden size of the decoder.
        num_layers (int): unused by this function.
        memory, memory_mask, knowledge: forwarded to `decoder_step`.
        output_size (int): size of the final projection.
        init_hidden (Variable): initial hidden state of the recurrence.
        mask (Variable, optional): mask over the decoder steps. It is
            transposed with [1, 0] for the recurrence and multiplied onto
            the output along axis 0 after the output has been transposed
            with [1, 0, 2]. When None, no masking is applied.
        dropout (float): dropout probability applied before the output
            projections.
        batch_first (bool): whether the embedded input is transposed with
            [1, 0, 2] before the recurrence.
        name (str): unused by this function.

    Returns:
        Variable: softmax over the projected decoder outputs.
    """
    input_emb = get_embedding(input, input_size, output_size)
    if batch_first:
        input_emb = layers.transpose(input_emb, perm=[1, 0, 2])

    # Test the mask with `is not None`, not by truthiness, so that a
    # tensor is never evaluated as a boolean.
    has_mask = mask is not None
    if has_mask:
        trans_mask = layers.transpose(mask, perm=[1, 0])

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(input_emb)
        step_mask = None

        if has_mask:
            step_mask = rnn.step_input(trans_mask)

        pre_hidden = rnn.memory(init=init_hidden)
        real_out, last_hidden = \
            decoder_step(gru_unit, cue_gru_unit, step_in, pre_hidden, input_size,
                         hidden_size, memory, memory_mask, knowledge, mask=step_mask)

        rnn.update_memory(pre_hidden, last_hidden)

        step_out = layers.squeeze(real_out, axes=[1])
        rnn.step_output(step_out)

    rnnout = rnn()
    rnnout = layers.transpose(rnnout, perm=[1, 0, 2])
    if has_mask:
        # `mask` defaults to None, so the masking must be optional.
        rnnout = layers.elementwise_mul(rnnout, mask, axis=0)

    output_in_size = hidden_size + hidden_size
    rnnout = layers.dropout(rnnout, dropout_prob=dropout)
    rnnout = fc(rnnout, output_in_size, hidden_size, name='dec_out_fc1')
    rnnout = fc(rnnout, hidden_size, output_size, name='dec_out_fc2')

    softmax_out = layers.softmax(rnnout)

    return softmax_out
def dot_attention(query, memory, mask=None):
    """Dot-product attention of `query` over `memory`.

    Args:
        query (Variable): the attention queries.
        memory (Variable): the attended memory; used both as keys
            (transposed in the matmul) and as values.
        mask (Variable, optional): scaled by 1e9 and added to the
            attention logits before the softmax. Since it is *added*,
            masked positions presumably carry negative values -- confirm
            with the caller.

    Returns:
        tuple: `(weight_memory, weight)`, the attended memory and the
            attention weights.
    """
    attn = layers.matmul(query, memory, transpose_y=True)

    # Test the mask with `is not None`, not by truthiness, so that a
    # tensor is never evaluated as a boolean.
    if mask is not None:
        # Swap the two leading axes so the mask can be added along the
        # trailing axes, then swap back.
        attn = layers.transpose(attn, [1, 0, 2])
        attn = layers.elementwise_add(attn, mask * 1000000000, -1)
        attn = layers.transpose(attn, [1, 0, 2])

    weight = layers.softmax(attn)
    weight_memory = layers.matmul(weight, memory)

    return weight_memory, weight
def forward(self, input):
    """Run the generator and return its output, CAM logits and a heatmap.

    Features from `self.DownBlock` are pooled globally (average and max),
    each pooled vector is classified by its own fc layer, and the weights
    of those fc layers are used to re-weight the feature channels. The
    fused features drive `self.FC` / `self.gamma` / `self.beta`, whose
    outputs are passed to every `UpBlock1_*` layer.

    Args:
        input (Variable): the input fed to `self.DownBlock`
            (presumably an NCHW image batch -- confirm with the caller).

    Returns:
        tuple: `(out, cam_logit, heatmap)`, where `cam_logit` concatenates
            the two pooled logits along axis 1 and `heatmap` is the sum
            over axis 1 of the fused features (axis kept).
    """
    features = self.DownBlock(input)

    def attention_branch(pool_type, fc):
        pooled = adaptive_pool2d(
            features, pool_size=[1, 1], pool_type=pool_type)
        flat = reshape(x=pooled, shape=(features.shape[0], -1))
        logit = fc(flat)
        # First parameter of the fc layer, transposed and expanded so it
        # can be broadcast over the two trailing axes of the features.
        weight = transpose(fc.parameters()[0], perm=[1, 0])
        weight = unsqueeze(weight, axes=2)
        weight = unsqueeze(weight, axes=3)
        return logit, features * weight

    gap_logit, gap = attention_branch('avg', self.gap_fc)
    gmp_logit, gmp = attention_branch('max', self.gmp_fc)

    cam_logit = concat(input=[gap_logit, gmp_logit], axis=1)
    x = self.relu(self.conv1x1(concat(input=[gap, gmp], axis=1)))

    heatmap = reduce_sum(x, dim=1, keep_dim=True)

    if self.light:
        # Light mode pools the features before the fully connected block.
        pooled = adaptive_pool2d(x, pool_size=[1, 1], pool_type='avg')
        x_ = reshape(x=pooled, shape=(pooled.shape[0], -1))
    else:
        x_ = reshape(x, shape=(x.shape[0], -1))
    x_ = self.FC(x_)
    gamma, beta = self.gamma(x_), self.beta(x_)

    for block_no in range(1, self.n_blocks + 1):
        x = getattr(self, 'UpBlock1_' + str(block_no))(x, gamma, beta)

    out = self.UpBlock2(x)
    return out, cam_logit, heatmap
def _relative_attention_inner(q, k, v, transpose):
    """Add a second matmul term, computed per position, to `q @ k`.

    The first term is `matmul(q, k)`. For the second term `q` is
    rearranged so that its axis 2 leads, multiplied with `v`, and the
    product is rearranged back so it can be added to the first term.

    Args:
        q (Variable): 4-D tensor; axes 0, 1, 2 are read as batch, heads
            and length.
        k (Variable): second operand of the first matmul.
        v (Variable): second operand of the second matmul (presumably the
            relative position embeddings -- confirm with the caller).
        transpose (bool): passed as `transpose_y` to both matmuls.

    Returns:
        Variable: the sum of the two matmul terms.
    """
    q_shape = layers.shape(q)
    batch_size, heads, length = q_shape[0], q_shape[1], q_shape[2]

    content_term = layers.matmul(q, k, transpose_y=transpose)

    # [batch, heads, length, depth] -> [length, batch * heads, depth]
    q_by_position = layers.reshape(
        layers.transpose(q, [2, 0, 1, 3]),
        [length, batch_size * heads, -1])
    position_term = layers.matmul(q_by_position, v, transpose_y=transpose)

    # [length, batch * heads, n] -> [batch, heads, length, n]
    position_term = layers.transpose(
        layers.reshape(position_term, [length, batch_size, heads, -1]),
        [1, 2, 0, 3])

    return content_term + position_term
def _attn_forward(self, queries, keys, values, attn_bias, past_cache, head_mask=None):
    """Multi-head attention with an optional expanded head count and mask.

    Args:
        queries, keys, values (Variable): 3-D inputs (asserted below).
        attn_bias (Variable or None): added to the attention scores before
            the softmax when given.
        past_cache (tuple or None): `(cached_k, cached_v)` from earlier
            steps; when given they are prepended to the new keys/values
            along axis 1.
        head_mask (Variable, optional): multiplied onto the attention
            probabilities after dropout.

    Returns:
        tuple: `(out, cache)`, where `cache` is the `(k, v)` pair projected
            from this call's inputs, taken before any concatenation with
            `past_cache`.
    """
    assert len(queries.shape) == len(keys.shape) == len(values.shape) == 3

    q = self.q(queries)
    k = self.k(keys)
    v = self.v(values)

    # The cache returned to the caller holds only the new projections.
    cache = (k, v)
    if past_cache is not None:
        cached_k, cached_v = past_cache
        k = L.concat([cached_k, k], 1)
        v = L.concat([cached_v, v], 1)

    # When the query projection exposes a `fn` with an 'expand_ratio' in
    # its current config, the head count is scaled by that ratio.
    n_head = self.n_head
    if hasattr(self.q, 'fn') and self.q.fn.cur_config['expand_ratio'] is not None:
        n_head = int(self.n_head * self.q.fn.cur_config['expand_ratio'])

    def split_heads(tensor):
        # -> [batch, head, seq, dim]
        head_dim = tensor.shape[-1] // n_head
        return L.transpose(
            L.reshape(tensor, [0, 0, n_head, head_dim]), [0, 2, 1, 3])

    q = split_heads(q)
    k = split_heads(k)
    v = split_heads(v)

    q = L.scale(q, scale=self.d_key**-0.5)
    score = L.matmul(q, k, transpose_y=True)
    if attn_bias is not None:
        score += attn_bias
    score = self.dropout(L.softmax(score, use_cudnn=True))
    if head_mask is not None:
        score = score * head_mask

    # Merge the heads back into the last axis.
    out = L.transpose(L.matmul(score, v), [0, 2, 1, 3])
    out = L.reshape(out, [0, 0, out.shape[2] * out.shape[3]])

    return self.o(out), cache
def gru_rnn(input, input_size, hidden_size, init_hidden=None, batch_first=False, mask=None, num_layers=1, dropout=0.0, name="gru"):
    """Run a (multi-layer) GRU over a sequence with a PaddingRNN.

    Args:
        input (Variable): the input sequence; transposed with [1, 0, 2]
            before the recurrence when `batch_first` is true.
        input_size (int): forwarded to `GRU_unit`.
        hidden_size (int): hidden size of the GRU.
        init_hidden (Variable): initial hidden state of the recurrence.
        batch_first (bool): whether `input` (and `mask`) have the batch on
            their first axis. The output follows the same layout.
        mask (Variable, optional): step mask forwarded to the GRU unit;
            transposed with [1, 0] when `batch_first` is true.
        num_layers (int): number of GRU layers.
        dropout (float): forwarded to `GRU_unit`.
        name (str): name prefix of the GRU unit.

    Returns:
        tuple: `(rnnout, last_hidden)`. `rnnout` is the per-step output,
            transposed back with [1, 0, 2] when `batch_first` is true;
            `last_hidden` is the final step of the hidden state reshaped
            to [num_layers, -1, hidden_size].
    """
    gru_unit = GRU_unit(input_size, hidden_size,
                        num_layers=num_layers, dropout=dropout,
                        name=name + "_gru_unit")

    # Test the mask with `is not None`, not by truthiness, so that a
    # tensor is never evaluated as a boolean.
    has_mask = mask is not None

    if batch_first:
        input = layers.transpose(x=input, perm=[1, 0, 2])
        if has_mask:
            mask = layers.transpose(mask, perm=[1, 0])

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(input)
        step_mask = None

        if has_mask:
            step_mask = rnn.step_input(mask)

        pre_hidden = rnn.memory(init=init_hidden)
        new_hidden, last_hidden = gru_unit(step_in, pre_hidden, step_mask)
        rnn.update_memory(pre_hidden, last_hidden)
        rnn.step_output(new_hidden)
        rnn.step_output(last_hidden)

    rnn_res = rnn()
    rnn_out = rnn_res[0]

    # Keep only the final step of the hidden-state output.
    last_hidden = layers.slice(
        rnn_res[1], axes=[0], starts=[-1], ends=[1000000000])
    last_hidden = layers.reshape(
        last_hidden, shape=[num_layers, -1, hidden_size])

    # Always bind the returned name: it used to be assigned only when
    # `batch_first` was true, so the default call raised NameError.
    rnnout = rnn_out
    if batch_first:
        rnnout = layers.transpose(x=rnn_out, perm=[1, 0, 2])

    return rnnout, last_hidden
def forward(self, x):
    """Classify a clip with 2-D residual stages and a flow layer.

    The input is average-pooled along one axis with a 3-D pool, processed
    frame by frame by the 2-D stem and residual stages, pooled again,
    passed through the flow block with a residual connection, and finally
    classified per frame and averaged over frames.

    Args:
        x (Variable): 5-D input; after swapping axes 1 and 2 its shape is
            unpacked as (b, c, t, h, w), so the input is presumably
            (batch, time, channels, height, width) -- confirm with the
            caller.

    Returns:
        Variable: the logits averaged over axis 1 of a
            [b, -1, num_classes] tensor.
    """

    def temporal_avg_pool(clip):
        # Swap axes 1 and 2, then average-pool axis 2 with kernel 3 and
        # stride 2.
        clip = layers.transpose(clip, perm=[0, 2, 1, 3, 4])
        return fluid.layers.pool3d(
            clip,
            pool_size=(3, 1, 1),
            pool_type='avg',
            pool_stride=(2, 1, 1))

    x = temporal_avg_pool(x)
    b, c, t, h, w = x.shape

    # Fold the frames into the batch axis for the 2-D layers.
    x = layers.transpose(x, perm=[0, 2, 1, 3, 4])
    x = layers.reshape(x, shape=[b * t, c, h, w])

    x = self.bn1(self.stem(x))
    x = layers.pool2d(
        x, pool_size=3, pool_type='max', pool_stride=2, pool_padding=1)
    x = self.res3(self.res2(x))

    # Unfold the frames again and pool a second time.
    _, c, h, w = x.shape
    x = layers.reshape(x, shape=[b, t, c, h, w])
    x = temporal_avg_pool(x)
    b, c, t, h, w = x.shape
    x = layers.transpose(x, perm=[0, 2, 1, 3, 4])

    # The residual branch drops the first and last frame.
    res = layers.reshape(x[:, 1:-1], shape=[-1, c, h, w])
    x = layers.reshape(x, shape=[b * t, c, h, w])

    x = self.rep_flow2(self.flow_conv(self.rep_flow(x)))
    x = layers.relu(res + x)

    x = self.res5(self.res4(x))
    x = self.dropout(x)

    # Average over the two trailing axes, classify every frame, then
    # average the per-frame logits.
    x = layers.reduce_mean(x, dim=3)
    x = layers.reduce_mean(x, dim=2)
    x = layers.reshape(x, shape=[x.shape[0], -1])
    x = self.classify(x)
    x = layers.reshape(x, shape=[b, -1, self.num_classes])

    return layers.reduce_mean(x, dim=1)
def PredictionModule(x, num_priors, num_classes, mask_dim, shared_conv_w, shared_conv_b, shared_bbox_w, shared_bbox_b, shared_conf_w, shared_conf_b, shared_mask_w, shared_mask_b):
    """Prediction head with three branches on a shared 3x3 convolution.

    Adapted from the PredictionModule of the DSSD algorithm, changed to
    use 3x3 convolutions. The shared feature `x` feeds three branches that
    predict the bbox, conf and mask coefficients respectively.

    Args:
        x (Variable): the input feature map.
        num_priors (int): number of priors per location.
        num_classes (int): number of classes predicted by the conf branch.
        mask_dim (int): number of mask coefficients per prior.
        shared_conv_w, shared_conv_b: parameter attributes of the shared
            convolution.
        shared_bbox_w, shared_bbox_b: parameter attributes of the bbox
            branch.
        shared_conf_w, shared_conf_b: parameter attributes of the conf
            branch.
        shared_mask_w, shared_mask_b: parameter attributes of the mask
            branch.

    Returns:
        dict: {'loc': bbox, 'conf': conf, 'mask': mask}, each reshaped to
            (batch, -1, k) with k = 4, num_classes and mask_dim
            respectively; the mask coefficients go through tanh.
    """
    shared = P.conv2d(x, 256, filter_size=(3, 3), stride=1, padding=1,
                      param_attr=shared_conv_w, bias_attr=shared_conv_b)
    shared = P.relu(shared)

    def branch(values_per_prior, weight_attr, bias_attr):
        out = P.conv2d(shared, num_priors * values_per_prior,
                       filter_size=(3, 3), stride=1, padding=1,
                       param_attr=weight_attr, bias_attr=bias_attr)
        # Move the channels last, then list every prior of every location
        # on axis 1.
        out = P.transpose(out, perm=[0, 2, 3, 1])
        return P.reshape(out, (P.shape(out)[0], -1, values_per_prior))

    bbox = branch(4, shared_bbox_w, shared_bbox_b)
    conf = branch(num_classes, shared_conf_w, shared_conf_b)
    mask = P.tanh(branch(mask_dim, shared_mask_w, shared_mask_b))

    return {'loc': bbox, 'conf': conf, 'mask': mask}
def create_cam_op(self, predict, class_dim, heatmaps):
    """Build the class activation map for the predicted class.

    The classifier weights of the predicted class are multiplied onto the
    feature maps, and the result is summed over axis 1.

    Args:
        predict: model output tensor activated by softmax
        class_dim: dim of multi-class vector
        heatmaps: the feature maps before global pooling

    Returns:
        heatmaps: class activation map

    Raises:
        ValueError: if `self.main_arch` is neither a DenseNet model nor
            "xception".
    """
    if self.main_arch in DenseNetModels:
        weights_shape = 1024
    elif self.main_arch == "xception":
        weights_shape = 2048
    else:
        raise ValueError(
            "Calc CAM of model arch {} is not supported.".format(
                self.main_arch))
    name = "fc_weights"

    # Shape notes below are the original author's example values.
    fc_weights = FL.create_parameter(
        shape=[weights_shape, class_dim], dtype='float32',
        name=name)  # 1024, 5

    pred_idx = FL.argmax(predict, 1)  # bs, 1
    # One weight row per class, then pick the row of the predicted class.
    per_class = FL.transpose(fc_weights, perm=[1, 0])  # 5, 1024
    selected = FL.gather(per_class, index=pred_idx)  # bs, 1024

    weighted = heatmaps * selected  # bs, 1024, 16, 16
    return FL.reduce_sum(weighted, dim=1, keep_dim=False)
def synthesize(args, config, model, vocoder, sentence, monotonic_layers):
    """Synthesize a waveform for one sentence.

    Args:
        args: parsed arguments; `args.vocoder` selects the vocoder path.
        config (dict): provides "decoder_layers", "backward_step" and
            "forward_step".
        model: the synthesis model, called without gradients.
        vocoder: callable turning the refined model output into a wave.
        sentence (str): the text to synthesize.
        monotonic_layers (iterable of int): indices of the decoder layers
            for which monotonic attention is forced.

    Returns:
        The synthesized waveform as a numpy array.
    """
    print("[synthesize] {}".format(sentence))

    # A batch holding the single encoded sentence, plus its length.
    token_ids = en.text_to_sequence(sentence, p=1.0)
    text = np.expand_dims(np.array(token_ids, dtype="int64"), 0)
    lengths = np.array([text.size], dtype=np.int64)

    text_seqs = dg.to_variable(text)
    text_lengths = dg.to_variable(lengths)

    force_monotonic_attention = [False] * config["decoder_layers"]
    for layer_idx in monotonic_layers:
        force_monotonic_attention[layer_idx] = True

    window = (config["backward_step"], config["forward_step"])
    with dg.no_grad():
        outputs = model(
            text_seqs,
            text_lengths,
            speakers=None,
            force_monotonic_attention=force_monotonic_attention,
            window=window)
        _, refined, _ = outputs

        if args.vocoder == "griffin-lim":
            # This vocoder consumes the transposed numpy spectrogram.
            return vocoder(refined.numpy()[0].T)

        wav = vocoder(F.transpose(refined, (0, 2, 1)))
        return wav.numpy()[0]
def __combine_heads(x):
    """Merge the attention heads back into the last dimension.

    Transposes and then reshapes the last two dimensions of the input
    tensor `x` so that they become one dimension; this is the reverse of
    `__split_heads`.

    Args:
        x (Variable): a 3-D tensor (returned unchanged) or a 4-D tensor.

    Returns:
        Variable: a 3-D tensor.

    Raises:
        ValueError: if `x` is neither 3-D nor 4-D.
    """
    if len(x.shape) == 3:
        return x
    if len(x.shape) != 4:
        raise ValueError("Input(x) should be a 4-D Tensor.")

    trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
    # FIXME(guosheng): Decouple the program desc with batch_size.
    # `batch_size` is not defined in this function; it is resolved from an
    # outer scope.
    # Build a real list: on Python 3 `map` returns a lazy iterator, which
    # is not a usable shape argument.
    merged_shape = [
        int(size)
        for size in (batch_size, -1, trans_x.shape[2] * trans_x.shape[3])
    ]
    return layers.reshape(x=trans_x, shape=merged_shape)
def __split_heads(x, n_head):
    """Split the last dimension into attention heads.

    Reshapes the last dimension of the input tensor `x` so that it becomes
    two dimensions, and then transposes. Specifically, a tensor with shape
    [bs, max_sequence_length, n_head * hidden_dim] becomes a tensor with
    shape [bs, n_head, max_sequence_length, hidden_dim].

    Args:
        x (Variable): the tensor to split.
        n_head (int): the number of heads; with a single head `x` is
            returned unchanged.

    Returns:
        Variable: the tensor with the heads on axis 1.
    """
    if n_head == 1:
        return x

    per_head = x.shape[-1] // n_head
    # FIXME(guosheng): Decouple the program desc with batch_size.
    # `batch_size` is not defined in this function; it is resolved from an
    # outer scope.
    split = layers.reshape(x=x, shape=[batch_size, -1, n_head, per_head])

    # Permute the dimensions into:
    # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
    return layers.transpose(x=split, perm=[0, 2, 1, 3])