def masks_to_boxes(masks):
    """Compute the bounding boxes around the provided masks.

    Args:
        masks: masks in [N, H, W] format, where N is the number of masks and
            (H, W) are the spatial dimensions.

    Returns:
        A [N, 4] tensor with the boxes in xyxy format. An input without any
        element yields an empty [0, 4] tensor.
    """
    # An input is empty as soon as ONE dimension is zero (e.g. [0, H, W]).
    # Summing the dimensions only detected the case where every dimension was
    # zero, so zero masks on a non-empty canvas fell through to the reductions.
    # The `not shape` part keeps the previous result for a dimensionless input.
    shape = list(masks.shape)
    if not shape or 0 in shape:
        return dg.to_variable(np.zeros((0, 4)))

    h, w = masks.shape[-2:]

    y = dg.to_variable(np.arange(0, h, 1, dtype="float32"))
    x = dg.to_variable(np.arange(0, w, 1, dtype="float32"))
    y, x = T.meshgrid([y, x])  # [h, w]

    # Background pixels contribute 0 to the max; for the min they are pushed
    # to a large sentinel so that only foreground coordinates can win.
    # NOTE(review): `~masks.numpy()` presumably requires a boolean (or integer)
    # mask dtype -- confirm against the callers.
    x_mask = (masks * L.unsqueeze(x, [0]))  # [N, H, W]
    x_max = L.reduce_max(L.flatten(x_mask, axis=1), dim=-1)
    non_mask = dg.to_variable(~masks.numpy())
    x_mask[non_mask] = 1e8
    x_min = L.reduce_min(L.flatten(x_mask, axis=1), dim=-1)

    y_mask = (masks * L.unsqueeze(y, [0]))  # [N, H, W]
    y_max = L.reduce_max(L.flatten(y_mask, axis=1), dim=-1)
    y_mask[non_mask] = 1e8
    y_min = L.reduce_min(L.flatten(y_mask, axis=1), dim=-1)

    return L.stack([x_min, y_min, x_max, y_max], 1)
def matrix_nms(seg_masks, cate_labels, cate_scores, kernel='gaussian', sigma=2.0, sum_masks=None):
    """Matrix NMS for multi-class masks.

    Args:
        seg_masks (Tensor): shape (n, h, w), masks made of 0s and 1s.
        cate_labels (Tensor): shape (n), mask labels in descending score order.
        cate_scores (Tensor): shape (n), mask scores in descending order.
        kernel (str): 'gaussian' or 'linear'.
        sigma (float): std in gaussian method.
        sum_masks (Tensor, optional): shape (n, ), the area of each of the n
            masks. Computed from ``seg_masks`` when omitted.

    Returns:
        Tensor: cate_scores_update, tensor of shape (n).

    Raises:
        NotImplementedError: if ``kernel`` is neither 'gaussian' nor 'linear'.
    """
    n_samples = L.shape(cate_labels)[0]  # number of objects
    seg_masks = L.reshape(seg_masks, (n_samples, -1))  # [n, h*w]

    # The signature advertises sum_masks=None, but the value used to be fed
    # straight into reshape. Derive the per-mask area when it is not supplied.
    if sum_masks is None:
        sum_masks = L.reduce_sum(seg_masks, dim=1)  # [n]

    # Intersection: the flattened masks times their own transpose gives the
    # pairwise overlap area.
    inter_matrix = L.matmul(seg_masks, seg_masks, transpose_y=True)  # [n, n]
    # Union: sum_masks repeated over n rows.
    sum_masks_x = L.expand(L.reshape(sum_masks, (1, -1)), [n_samples, 1])  # [n, n]
    # IoU.
    iou_matrix = inter_matrix / (sum_masks_x + L.transpose(sum_masks_x, [1, 0]) - inter_matrix)

    # Strict upper-triangular selector (column index > row index): entry
    # (i, j) is kept only when object i precedes object j.
    rows = L.range(0, n_samples, 1, 'int32')
    cols = L.range(0, n_samples, 1, 'int32')
    rows = L.expand(L.reshape(rows, (1, -1)), [n_samples, 1])
    cols = L.expand(L.reshape(cols, (-1, 1)), [1, n_samples])
    tri_mask = L.cast(rows > cols, 'float32')
    iou_matrix = tri_mask * iou_matrix  # [n, n], upper triangle only

    # Label-specific matrix: 1 where both objects share a label.
    cate_labels_x = L.expand(L.reshape(cate_labels, (1, -1)), [n_samples, 1])  # [n, n]
    label_matrix = L.cast(L.equal(cate_labels_x, L.transpose(cate_labels_x, [1, 0])), 'float32')
    label_matrix = tri_mask * label_matrix  # [n, n], upper triangle only

    # IoU compensation: for every object, its largest same-label IoU with an
    # earlier object, broadcast along the rows.
    compensate_iou = L.reduce_max(iou_matrix * label_matrix, dim=0)
    compensate_iou = L.expand(L.reshape(compensate_iou, (1, -1)), [n_samples, 1])  # [n, n]
    compensate_iou = L.transpose(compensate_iou, [1, 0])  # [n, n]

    # IoU decay.
    decay_iou = iou_matrix * label_matrix

    # Matrix NMS: the decay of each object is the column-wise minimum.
    if kernel == 'gaussian':
        decay_matrix = L.exp(-1 * sigma * (decay_iou ** 2))
        compensate_matrix = L.exp(-1 * sigma * (compensate_iou ** 2))
        decay_coefficient = L.reduce_min((decay_matrix / compensate_matrix), dim=0)
    elif kernel == 'linear':
        decay_matrix = (1 - decay_iou) / (1 - compensate_iou)
        decay_coefficient = L.reduce_min(decay_matrix, dim=0)
    else:
        raise NotImplementedError("unsupported kernel: %r" % (kernel,))

    # Update the score.
    cate_scores_update = cate_scores * decay_coefficient
    return cate_scores_update
def get_face_bbox_for_output(data_cfg, pose, crop_smaller=0):
    """Get pixel coordinates of the face bounding box.

    Args:
        data_cfg: data configuration; not read by the current implementation.
        pose: pose label map with 3, 4 or 5 dimensions. A 3-D input gets a
            leading axis; for a 5-D input ``pose[-1, -1:]`` is used.
        crop_smaller: number of pixels by which the box is shrunk on each side.

    Returns:
        ``[ys, ye, xs, xe]`` as plain Python ints.
    """

    def _as_int(value):
        # Coordinates derived from tensor reductions are still tensors.
        if isinstance(value, int):
            return value
        return int(value.numpy()[0])

    n_dims = len(pose.shape)
    if n_dims == 3:
        pose = L.unsqueeze(pose, [0])
    elif n_dims == 5:
        pose = pose[-1, -1:]
    _, _, h, w = pose.shape

    # Hard-wired off: only the densepose path is implemented.
    use_openpose = False
    if use_openpose:
        # Use openpose face keypoints to identify face region.
        raise NotImplementedError()
    # Use densepose labels: pixels of channel 2 above 0.9 count as face.
    face = T.search.nonzero((pose[:, 2] > 0.9).astype("int64"), as_tuple=False)

    # The box is square, so one side length serves both axes.
    side = h // 32 * 8
    if face.shape[0]:
        face_rows, face_cols = face[:, 1], face[:, 2]
        ys, ye = L.reduce_min(face_rows), L.reduce_max(face_rows)
        xs, xe = L.reduce_min(face_cols), L.reduce_max(face_cols)
        if use_openpose:
            xc, yc = (xs + xe) // 2, (ys * 3 + ye * 2) // 5
            side = int((xe - xs) * 2.5)
        else:
            xc, yc = (xs + xe) // 2, (ys + ye) // 2
            side = int((ye - ys) * 1.25)
        side = min(w, max(32, side))
        # Clamp the centre so the box stays inside the image.
        yc = max(side // 2, min(h - 1 - side // 2, yc))
        xc = max(side // 2, min(w - 1 - side // 2, xc))
    else:
        # No face pixels found: fall back to a default location.
        yc = h // 4
        xc = w // 2

    half = side // 2
    ys, ye = yc - half, yc + half
    xs, xe = xc - half, xc + half

    if crop_smaller != 0:
        # Crop slightly smaller inside face.
        ys += crop_smaller
        xs += crop_smaller
        ye -= crop_smaller
        xe -= crop_smaller

    return [_as_int(bound) for bound in (ys, ye, xs, xe)]
def is_finished(self, step_idx, source_length, alive_log_probs, finished_scores, finished_in_finished):
    """Build the condition under which beam-search decoding keeps running.

    Decoding goes on while the worst finished score does not yet beat the
    length-penalised score of the selected alive hypothesis AND the step
    index is still below ``source_length + 50``.

    Returns:
        The logical AND of "bound not met" and "step_idx < source_length + 50".
    """
    # Length penalty: ((source_length + 55) / 6) ** alpha.
    penalty_base = layers.cast(source_length, 'float32') + 55.0
    penalty_base /= 6.0
    max_length_penalty = layers.pow(penalty_base, self.alpha)

    # Penalised score of the alive entries picked by self.get_alive_index.
    alive_flat = layers.reshape(alive_log_probs, [-1])
    picked_alive = layers.gather(alive_flat, [self.get_alive_index])
    alive_bound = picked_alive / max_length_penalty

    # Lowest score among the finished entries; unfinished slots are zeroed by
    # the multiplication with the finished flags.
    worst_finished = layers.reduce_min(finished_scores * finished_in_finished, dim=1)
    finished_in_finished = layers.cast(finished_in_finished, 'bool')
    any_finished = layers.cast(layers.reduce_any(finished_in_finished, 1), 'float32')
    # Rows without any finished entry get -INF added so they cannot satisfy
    # the bound.
    worst_finished += ((1.0 - any_finished) * -INF)

    bound_is_met = layers.reduce_all(layers.greater_than(worst_finished, alive_bound))

    decode_length = source_length + 50
    within_length = layers.less_than(x=step_idx, y=decode_length)

    return layers.logical_and(x=layers.logical_not(bound_is_met), y=within_length)
def is_finished(alive_log_prob, finished_scores, finished_in_finished):
    """Return whether fewer than ``beam_size`` hypotheses have finished.

    Relies on ``alpha``, ``INF`` and ``beam_size`` from the enclosing scope.
    """
    max_out_len = 200
    penalty_base = layers.fill_constant(
        [1], dtype='float32', value=((5.0 + max_out_len) / 6.0))
    max_length_penalty = layers.pow(penalty_base, alpha)

    # Length-penalised score of the first alive entry.
    first_alive = layers.slice(alive_log_prob, starts=[0], ends=[1], axes=[0])
    alive_bound = first_alive / max_length_penalty

    # Lowest score among finished entries; unfinished slots receive -INF.
    worst_finished = finished_scores * finished_in_finished
    worst_finished += (1.0 - finished_in_finished) * -INF
    worst_finished = layers.reduce_min(worst_finished)

    met = layers.cast(layers.less_than(alive_bound, worst_finished), 'float32')
    # NOTE(review): bound_is_met is built but never used in the returned
    # condition -- confirm whether that is intentional.
    bound_is_met = layers.reduce_sum(met)

    finished_eos_num = layers.reduce_sum(finished_in_finished)
    beam_limit = layers.fill_constant([1], dtype='float32', value=beam_size)
    return layers.less_than(finished_eos_num, beam_limit)
def _matrix_nms(bboxes, cate_labels, cate_scores, kernel='gaussian', sigma=2.0):
    """Matrix NMS for multi-class bboxes.

    Args:
        bboxes (Tensor): shape (n, 4).
        cate_labels (Tensor): shape (n), box labels in descending score order.
        cate_scores (Tensor): shape (n), box scores in descending order.
        kernel (str): 'linear' or 'gaussian'.
        sigma (float): std in gaussian method.

    Returns:
        Tensor: cate_scores_update, tensor of shape (n). An empty list is
        returned when there are no candidates.

    Raises:
        NotImplementedError: if ``kernel`` is neither 'gaussian' nor 'linear'.
    """
    n_samples = len(cate_labels)
    if n_samples == 0:
        return []

    # n x n matrix of pairwise IoUs between the boxes; only the strict upper
    # triangle is kept, so entry (i, j) relates box j to the higher-scored box i.
    iou_matrix = jaccard(bboxes, bboxes)  # shape: [n_samples, n_samples]
    iou_matrix = paddle.triu(iou_matrix, diagonal=1)

    # Label-specific matrix: entry (i, j) tells whether boxes i and j share a
    # class id; only boxes of the same class suppress each other.
    cate_labels_x = L.expand(L.reshape(cate_labels, (1, -1)), [n_samples, 1])  # [n_samples, n_samples]
    d = cate_labels_x - L.transpose(cate_labels_x, [1, 0])
    # Squared difference: 0 for the same class, > 0 otherwise. `< 1` is used
    # because, per the original note, comparing with `== 0` did not work.
    d = L.pow(d, 2)
    label_matrix = paddle.triu(L.cast(d < 1, 'float32'), diagonal=1)  # [n_samples, n_samples]

    # IoU compensation: zero the IoU of different-class pairs, then take the
    # column-wise maximum, i.e. for each box its highest IoU with a
    # higher-scored box of the same class.
    compensate_iou = L.reduce_max(iou_matrix * label_matrix, [0, ])  # [n_samples, ]
    # Broadcast so that row i holds that value for box i in every column.
    compensate_iou = L.transpose(
        L.expand(L.reshape(compensate_iou, (1, -1)), [n_samples, 1]), [1, 0])  # [n_samples, n_samples]

    # IoU decay: entry (i, j) is the IoU of boxes i and j, zeroed for pairs of
    # different classes, upper triangle only.
    decay_iou = iou_matrix * label_matrix  # [n_samples, n_samples]

    # Matrix NMS: the decay of box j is the MINIMUM over column j -- the
    # strongest suppressor wins. Entries outside the upper triangle are >= 1
    # and therefore never selected by the minimum.
    if kernel == 'gaussian':
        decay_matrix = L.exp(-1 * sigma * (decay_iou ** 2))
        compensate_matrix = L.exp(-1 * sigma * (compensate_iou ** 2))
        # This reduction used to be a sum, which adds up all the >= 1 filler
        # entries and can push the coefficient above 1 (raising scores
        # instead of decaying them). Use the minimum, as in the linear branch.
        decay_coefficient = L.reduce_min(decay_matrix / compensate_matrix, [0, ])
    elif kernel == 'linear':
        decay_matrix = (1 - decay_iou) / (1 - compensate_iou)
        decay_coefficient = L.reduce_min(decay_matrix, [0, ])
    else:
        raise NotImplementedError

    # Update the scores.
    cate_scores_update = cate_scores * decay_coefficient
    return cate_scores_update
def early_finish(alive_log_probs, finished_scores, finished_in_finished):
    """Tell whether beam search may stop early.

    Relies on ``max_len``, ``alpha`` and ``inf`` from the enclosing scope.

    Returns:
        A boolean variable that is true when, for every row, the lowest
        finished score exceeds the length-penalised score of the first alive
        sequence.
    """
    max_length_penalty = np.power(((5. + max_len) / 6.), alpha)

    # The best possible score of the most likely alive sequence.
    top_alive = alive_log_probs[:, 0]
    alive_bound = top_alive / max_length_penalty

    # Lowest score of a finished sequence. Unfinished entries are multiplied
    # by 0; since all scores are negative, the min then yields the score of
    # the lowest finished item.
    worst_finished = layers.reduce_min(finished_scores * finished_in_finished, 1)

    # If no sequence has finished, the min is 0 and must be replaced by
    # negative infinity: any alive score is then higher and the termination
    # condition is not met.
    has_finished = layers.reduce_max(finished_in_finished, 1)
    worst_finished += (1. - has_finished) * -inf

    return layers.reduce_all(layers.greater_than(worst_finished, alive_bound))
def norm_img(self, x):
    """Rescale ``x`` by its global minimum and maximum, times 255.

    NOTE(review): the denominator is (min - max), so the result lies in
    [-255, 0] rather than [0, 255]. The original comment indicates this form
    was kept on purpose -- confirm against the callers before changing it.
    """
    highest = reduce_max(x)
    lowest = reduce_min(x)
    shifted = 255 * (x - lowest)
    return shifted / (lowest - highest)
def norm_img(self, x):
    """Rescale ``x`` by its global minimum and maximum, times 255.

    NOTE(review): the denominator is (min - max), so the result lies in
    [-255, 0] rather than [0, 255] -- confirm this is what callers expect.
    """
    highest = layers.reduce_max(x)
    lowest = layers.reduce_min(x)
    shifted = 255 * (x - lowest)
    return shifted / (lowest - highest)
def decode(self, encoder_out, text_positions, speaker_embed=None, test_inputs=None):
    """Decode from the encoder's output and other conditions.

    Runs the decoder one step at a time. Without ``test_inputs`` the loop is
    auto-regressive (each step consumes the previous mel output) and stops
    once the done probability exceeds 0.5 after ``min_decoder_steps``, or
    after ``max_decoder_steps``. With ``test_inputs`` the given frames are fed
    instead and the loop stops when they are exhausted.

    Args:
        encoder_out (keys, values):
            keys (Variable): shape(B, T_enc, C_emb), dtype float32, the key
                representation from an encoder, where C_emb means text
                embedding size.
            values (Variable): shape(B, T_enc, C_emb), dtype float32, the
                value representation from an encoder, where C_emb means text
                embedding size.
        text_positions (Variable): shape(B, T_enc), dtype: int64. Position
            indices for text inputs for the encoder, where T_enc means the
            encoder timesteps.
        speaker_embed (Variable, optional): shape(B, C_sp), speaker embedding,
            only used for multispeaker model.
        test_inputs (Variable, optional): shape(B, T_test, C_mel). Test input,
            it is only used for debugging. Defaults to None.

    Returns:
        outputs (Variable): shape(B, T_mel, C_mel), dtype float32, decoder
            outputs, where C_mel means the channels of mel-spectrogram, T_mel
            means the length (time steps) of mel spectrogram.
        alignments (Variable): documented as shape(N, B, T_mel // r, T_enc),
            dtype float32, the alignment tensor between the decoder and the
            encoder, where N means number of Attention Layers, r means the
            outputs per decoder step, T_enc means the encoder time steps.
            NOTE(review): the code averages the scores over the attention
            layers at every step and concatenates along axis 1, so the result
            is presumably (B, T_mel // r, T_enc) -- confirm.
        done (Variable): shape(B, T_mel // r), dtype float32, probability that
            the last frame has been generated. If the probability is larger
            than 0.5 at a step, the generation stops.
        decoder_states (Variable): shape(B, T_mel, C_dec // r), dtype float32,
            decoder hidden states, where C_dec means the channels of decoder
            states (the output channels of the last `convolutions`). Note that
            it should be perfectly divisible by `r`.

    Note:
        Only single instance inference is supported now, so B = 1.
    """
    self.start_sequence()
    keys, values = encoder_out
    batch_size = keys.shape[0]
    assert batch_size == 1, "now only supports single instance inference"
    mask = None  # no mask because we use single instance decoding

    # NOTE(review): the original comment here reads "no dropout in inference",
    # yet F.dropout is still called; presumably it is inactive outside of
    # training mode -- confirm.
    if speaker_embed is not None:
        speaker_embed = F.dropout(
            speaker_embed,
            self.dropout,
            dropout_implementation="upscale_in_train")

    # since we use single example inference, there is no text_mask
    if text_positions is not None:
        w = self.key_position_rate
        if self.n_speakers > 1:
            # shape (B, )
            w = w * F.squeeze(self.speaker_proj1(speaker_embed), [-1])
        text_pos_embed = self.embed_keys_positions(text_positions, w)
        keys += text_pos_embed  # (B, T, C)

    # start decoding
    decoder_states = []  # (B, C, 1) tensors
    mel_outputs = []  # (B, C, 1) tensors
    alignments = []  # (B, 1, T_enc) tensors
    dones = []  # (B, 1, 1) tensors
    # One "last attended position" slot per conv/attention pair; it starts at
    # 0 for layers with forced monotonic attention and stays None otherwise.
    last_attended = [None] * len(self.conv_attn)
    for idx, monotonic_attn in enumerate(self.force_monotonic_attention):
        if monotonic_attn:
            last_attended[idx] = 0

    if test_inputs is not None:
        # pack multiple frames if necessary
        # assume (B, T, C) input
        test_inputs = fold_adjacent_frames(test_inputs, self.r)
        test_inputs = F.transpose(test_inputs, [0, 2, 1])

    # All-zero frame fed at the first auto-regressive step.
    initial_input = F.zeros(
        (batch_size, self.mel_dim * self.r, 1), dtype=keys.dtype)

    t = 0  # decoder time step
    while True:
        # Position of the current frame, counted from 1.
        frame_pos = F.fill_constant(
            (batch_size, 1), value=t + 1, dtype="int64")
        w = self.query_position_rate
        if self.n_speakers > 1:
            w = w * F.squeeze(self.speaker_proj2(speaker_embed), [-1])
        # (B, T=1, C)
        frame_pos_embed = self.embed_query_positions(frame_pos, w)

        if test_inputs is not None:
            # Stop once every provided frame has been consumed.
            if t >= test_inputs.shape[-1]:
                break
            current_input = test_inputs[:, :, t:t + 1]
        else:
            if t > 0:
                current_input = mel_outputs[-1]  # auto-regressive
            else:
                current_input = initial_input

        x_t = current_input
        x_t = F.dropout(
            x_t, self.dropout, dropout_implementation="upscale_in_train")

        # Prenet
        for layer in self.prenet:
            if isinstance(layer, Conv1DGLU):
                x_t = layer.add_input(x_t, speaker_embed)
            else:
                x_t = layer(x_t)  # (B, C, T=1)

        step_attn_scores = []
        # causal convolutions + multi-hop attentions
        for i, (conv, attn) in enumerate(self.conv_attn):
            residual = x_t  # (B, C, T=1)
            x_t = conv.add_input(x_t, speaker_embed)
            if attn is not None:
                x_t = F.transpose(x_t, [0, 2, 1])
                if frame_pos_embed is not None:
                    x_t += frame_pos_embed
                # The last attended position is only passed on when decoding
                # freely (no test_inputs).
                x_t, attn_scores = attn(
                    x_t, (keys, values), mask,
                    last_attended[i] if test_inputs is None else None)
                x_t = F.transpose(x_t, [0, 2, 1])
                step_attn_scores.append(attn_scores)  # (B, T_dec=1, T_enc)

                # update last attended when necessary
                if self.force_monotonic_attention[i]:
                    last_attended[i] = np.argmax(
                        attn_scores.numpy(), axis=-1)[0][0]
            # Residual connection, scaled by sqrt(0.5).
            x_t = F.scale(residual + x_t, np.sqrt(0.5))
        if len(step_attn_scores):
            # (B, 1, T_enc) again
            average_attn_scores = F.reduce_mean(
                F.stack(step_attn_scores, 0), 0)
        else:
            average_attn_scores = None

        decoder_state_t = x_t
        x_t = self.last_conv(x_t)

        mel_output_t = F.sigmoid(x_t)
        done_t = F.sigmoid(self.fc(x_t))

        decoder_states.append(decoder_state_t)
        mel_outputs.append(mel_output_t)
        if average_attn_scores is not None:
            alignments.append(average_attn_scores)
        dones.append(done_t)

        t += 1

        if test_inputs is None:
            # Stop when the model signals completion (after the minimum
            # number of steps), or when the step limit is exceeded.
            if F.reduce_min(done_t).numpy(
            )[0] > 0.5 and t > self.min_decoder_steps:
                break
            elif t > self.max_decoder_steps:
                break

    # concat results
    mel_outputs = F.concat(mel_outputs, axis=-1)
    decoder_states = F.concat(decoder_states, axis=-1)
    dones = F.concat(dones, axis=-1)
    alignments = F.concat(alignments, axis=1)

    mel_outputs = F.transpose(mel_outputs, [0, 2, 1])
    decoder_states = F.transpose(decoder_states, [0, 2, 1])
    dones = F.squeeze(dones, [1])

    # Undo the frame packing so there is one time step per mel frame.
    mel_outputs = unfold_adjacent_frames(mel_outputs, self.r)
    decoder_states = unfold_adjacent_frames(decoder_states, self.r)

    return mel_outputs, alignments, dones, decoder_states
def norm_range(t, range):
    """Normalise ``t`` via ``norm_ip`` using ``range`` or its own min/max.

    Args:
        t: tensor handed to ``norm_ip``.
        range: ``(low, high)`` pair, or None to use the minimum and maximum
            of ``t``.
    """
    if range is None:
        low, high = float(F.reduce_min(t)), float(F.reduce_max(t))
    else:
        low, high = range[0], range[1]
    norm_ip(t, low, high)