def forward(self, pred, label):
    label = label.reshape(pred.shape)
    sample_weight = label != self._ignore_label
    label = paddle.where(sample_weight, label, paddle.zeros_like(label))

    if not self._from_sigmoid:
        loss = F.relu(pred) - pred * label + F.softplus(-paddle.abs(pred))
    else:
        eps = 1e-12
        loss = -(paddle.log(pred + eps) * label
                 + paddle.log(1. - pred + eps) * (1. - label))

    loss = self._weight * (loss * sample_weight)
    return paddle.mean(loss,
                       axis=misc.get_dims_with_exclusion(
                           len(loss.shape), self._batch_axis))
def bbox_overlaps(boxes1, boxes2):
    area1 = bbox_area(boxes1)
    area2 = bbox_area(boxes2)
    xy_max = paddle.minimum(
        paddle.unsqueeze(boxes1, 1)[:, :, 2:], boxes2[:, 2:])
    xy_min = paddle.maximum(
        paddle.unsqueeze(boxes1, 1)[:, :, :2], boxes2[:, :2])
    width_height = xy_max - xy_min
    width_height = width_height.clip(min=0)
    inter = width_height.prod(axis=2)
    overlaps = paddle.where(
        inter > 0,
        inter / (paddle.unsqueeze(area1, 1) + area2 - inter),
        paddle.zeros_like(inter))
    return overlaps
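# A minimal usage sketch for bbox_overlaps above (hypothetical standalone
# check, not from the source repo; assumes bbox_area computes
# (x2 - x1) * (y2 - y1) as defined here).
def bbox_area(boxes):
    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])

# Two boxes of area 4 overlapping on a 1x2 strip: IoU = 2 / (4 + 4 - 2) = 1/3.
b1 = paddle.to_tensor([[0., 0., 2., 2.]])
b2 = paddle.to_tensor([[1., 0., 3., 2.]])
print(bbox_overlaps(b1, b2).numpy())  # [[0.3333...]]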
def forward(self, feature, label):
    cos_theta = paddle.mm(F.normalize(feature, axis=1),
                          F.normalize(self.weight, axis=0))
    sin_theta = paddle.sqrt(
        paddle.clip(1.0 - paddle.pow(cos_theta, 2), min=0, max=1))
    cos_theta_m = cos_theta * self.cos_m - sin_theta * self.sin_m
    cos_theta_m = paddle.where(cos_theta > self.threshold, cos_theta_m,
                               cos_theta - self.mm)
    one_hot = paddle.nn.functional.one_hot(label, self.class_dim)
    output = (one_hot * cos_theta_m) + (paddle.abs(
        (1.0 - one_hot)) * cos_theta)
    output *= self.s
    # Simpler classification variant (requires a learning rate of 0.1):
    # cosine = self.cosine_sim(feature, self.weight)
    # one_hot = paddle.nn.functional.one_hot(label, self.class_dim)
    # output = self.s * (cosine - one_hot * self.m)
    return output
def forward(self, input, target):
    if self.log_target:
        out = paddle.exp(target) * (target - input)
    else:
        out_pos = target * (paddle.log(target) - input)
        zeros = paddle.zeros_like(out_pos)
        out = paddle.where(target > 0, out_pos, zeros)
    out_sum = paddle.sum(out)
    if self.reduction == "sum":
        return out_sum
    elif self.reduction == "batchmean":
        n = input.shape[0]
        return out_sum / n
    elif self.reduction == "mean":
        return paddle.mean(out)
    else:
        return out
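# A minimal sanity check for the KL term above (hypothetical values, not from
# the source): `input` holds log-probabilities and `target` holds
# probabilities, as in F.kl_div. The paddle.where guard matters when target
# has zeros, since 0 * log(0) would otherwise produce NaN.
target = paddle.to_tensor([0.5, 0.5, 0.0])
input = paddle.log(paddle.to_tensor([0.25, 0.25, 0.5]))
out_pos = target * (paddle.log(target) - input)  # NaN at the zero entry
out = paddle.where(target > 0, out_pos, paddle.zeros_like(out_pos))
print(float(paddle.sum(out)))  # ~0.693, i.e. 2 * 0.5 * log(0.5 / 0.25)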
def test_api(self, use_cuda=False):
    for x_stop_gradient in [False, True]:
        for y_stop_gradient in [False, True]:
            with fluid.program_guard(Program(), Program()):
                cond = fluid.layers.data(name='cond',
                                         shape=self.shape,
                                         dtype='bool')
                x = fluid.layers.data(name='x',
                                      shape=self.shape,
                                      dtype='float32')
                y = fluid.layers.data(name='y',
                                      shape=self.shape,
                                      dtype='float32')
                x.stop_gradient = x_stop_gradient
                y.stop_gradient = y_stop_gradient
                result = paddle.where(cond, x, y)
                append_backward(layers.mean(result))

                for use_cuda in [False, True]:
                    if use_cuda and not fluid.core.is_compiled_with_cuda():
                        break
                    place = (fluid.CUDAPlace(0)
                             if use_cuda else fluid.CPUPlace())
                    exe = fluid.Executor(place)
                    fetch_list = [result, result.grad_name]
                    if x_stop_gradient is False:
                        fetch_list.append(x.grad_name)
                    if y_stop_gradient is False:
                        fetch_list.append(y.grad_name)
                    out = exe.run(fluid.default_main_program(),
                                  feed={'cond': self.cond,
                                        'x': self.x,
                                        'y': self.y},
                                  fetch_list=fetch_list)
                    assert np.array_equal(out[0], self.out)
                    if x_stop_gradient is False:
                        assert np.array_equal(out[2],
                                              self.ref_x_backward(out[1]))
                        if y_stop_gradient is False:
                            assert np.array_equal(
                                out[3], self.ref_y_backward(out[1]))
                    elif y_stop_gradient is False:
                        assert np.array_equal(out[2],
                                              self.ref_y_backward(out[1]))
def forward(self, bond_types_batch, type_count_batch, bond_feat):
    """
    Input example:
        bond_types_batch: [0,0,2,0,1,2] + [0,0,2,0,1,2] + [2]
        type_count_batch: [[3, 3, 0], [1, 1, 0], [2, 2, 1]]  # [num_type, batch_size]
    """
    bond_feat = self.fc_1(
        paddle.reshape(bond_feat, [-1, self.num_angle * self.bond_dim]))
    inter_mat_list = []
    for type_i in range(self.num_type):
        type_i_index = paddle.masked_select(
            paddle.arange(len(bond_feat)), bond_types_batch == type_i)
        if paddle.sum(type_count_batch[type_i]) == 0:
            inter_mat_list.append(
                paddle.to_tensor(
                    np.array([0.] * len(type_count_batch[type_i])),
                    dtype='float32'))
            continue
        bond_feat_type_i = paddle.gather(bond_feat, type_i_index)
        graph_bond_index = op.get_index_from_counts(type_count_batch[type_i])
        # graph_bond_id = generate_segment_id_from_index(graph_bond_index)
        graph_bond_id = generate_segment_id(graph_bond_index)
        graph_feat_type_i = math.segment_pool(
            bond_feat_type_i, graph_bond_id, pool_type='sum')
        mat_flat_type_i = self.fc_2(graph_feat_type_i).squeeze(1)
        # Pad to batch_size so each type column has one entry per graph.
        my_pad = nn.Pad1D(
            padding=[0, len(type_count_batch[type_i]) - len(mat_flat_type_i)],
            value=-1e9)
        mat_flat_type_i = my_pad(mat_flat_type_i)
        inter_mat_list.append(mat_flat_type_i)

    inter_mat_batch = paddle.stack(inter_mat_list, axis=1)  # [batch_size, num_type]
    inter_mat_mask = paddle.ones_like(inter_mat_batch) * -1e9
    inter_mat_batch = paddle.where(
        type_count_batch.transpose([1, 0]) > 0, inter_mat_batch,
        inter_mat_mask)
    inter_mat_batch = self.softmax(inter_mat_batch)
    return inter_mat_batch
def box_overlap_opr(box, gt):
    assert box.ndim == 2
    assert gt.ndim == 2
    area_box = (box[:, 2] - box[:, 0] + 1) * (box[:, 3] - box[:, 1] + 1)
    area_gt = (gt[:, 2] - gt[:, 0] + 1) * (gt[:, 3] - gt[:, 1] + 1)
    width_height = paddle.minimum(
        box[:, 2:].unsqueeze(axis=-2), gt[:, 2:]) - paddle.maximum(
            box[:, :2].unsqueeze(axis=-2), gt[:, :2]) + 1  # [N,M,2]
    width_height = width_height.clip(min=0)  # [N,M,2]
    inter = width_height.prod(axis=2)  # [N,M]
    del width_height

    # handle empty boxes
    iou = paddle.where(
        inter > 0,
        inter / (area_box.unsqueeze(axis=-1) + area_gt - inter),
        paddle.zeros_like(inter))
    return iou
def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
        rel_pos=None,
        rel_2d_pos=None, ):
    q, k, v = self.compute_qkv(hidden_states)

    # (B, L, H*D) -> (B, H, L, D)
    query_layer = self.transpose_for_scores(q)
    key_layer = self.transpose_for_scores(k)
    value_layer = self.transpose_for_scores(v)

    query_layer = query_layer / math.sqrt(self.attention_head_size)
    # [BSZ, NAT, L, L]
    attention_scores = paddle.matmul(query_layer,
                                     key_layer.transpose([0, 1, 3, 2]))
    if self.has_relative_attention_bias:
        attention_scores += rel_pos
    if self.has_spatial_attention_bias:
        attention_scores += rel_2d_pos
    attention_scores = paddle.where(
        attention_mask.astype(paddle.bool).expand_as(attention_scores),
        paddle.ones_like(attention_scores) * float("-inf"),
        attention_scores)
    attention_probs = F.softmax(attention_scores, axis=-1)
    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs = self.dropout(attention_probs)
    context_layer = paddle.matmul(attention_probs, value_layer)
    context_layer = context_layer.transpose([0, 2, 1, 3])
    new_context_layer_shape = context_layer.shape[:-2] + [self.all_head_size]
    context_layer = context_layer.reshape(new_context_layer_shape)

    outputs = (context_layer, attention_probs) if output_attentions else (
        context_layer, )
    return outputs
def test_api(self):
    for x_stop_gradient in [False, True]:
        for y_stop_gradient in [False, True]:
            train_prog = fluid.Program()
            startup = fluid.Program()
            with fluid.program_guard(train_prog, startup):
                cond = fluid.data(name='cond', shape=self.shape, dtype='bool')
                x = fluid.data(name='x', shape=self.shape, dtype='float32')
                y = fluid.data(name='y', shape=self.shape, dtype='float32')
                x.stop_gradient = x_stop_gradient
                y.stop_gradient = y_stop_gradient
                result = paddle.where(cond, x, y)
                append_backward(fluid.layers.mean(result))

                exe = fluid.Executor(self.place)
                exe.run(startup)
                fetch_list = [result, result.grad_name]
                if x_stop_gradient is False:
                    fetch_list.append(x.grad_name)
                if y_stop_gradient is False:
                    fetch_list.append(y.grad_name)
                out = exe.run(train_prog,
                              feed={'cond': self.cond,
                                    'x': self.x,
                                    'y': self.y},
                              fetch_list=fetch_list)
                assert np.array_equal(out[0], self.out)
                if x_stop_gradient is False:
                    assert np.array_equal(out[2],
                                          self.ref_x_backward(out[1]))
                    if y_stop_gradient is False:
                        assert np.array_equal(out[3],
                                              self.ref_y_backward(out[1]))
                elif y_stop_gradient is False:
                    assert np.array_equal(out[2],
                                          self.ref_y_backward(out[1]))
def test_api_broadcast(self, use_cuda=False):
    train_prog = fluid.Program()
    startup = fluid.Program()
    with fluid.program_guard(train_prog, startup):
        x = fluid.layers.data(name='x', shape=[4, 1], dtype='float32')
        y = fluid.layers.data(name='y', shape=[4, 2], dtype='float32')
        x_i = np.array([[0.9383, 0.1983, 3.2, 1.2]]).astype("float32")
        y_i = np.array([[1.0, 1.0, 1.0, 1.0],
                        [1.0, 1.0, 1.0, 1.0]]).astype("float32")
        result = paddle.where(x > 1, x=x, y=y)

        exe = fluid.Executor(self.place)
        exe.run(startup)
        out = exe.run(train_prog,
                      feed={'x': x_i,
                            'y': y_i},
                      fetch_list=[result])
        assert np.array_equal(out[0], np.where(x_i > 1, x_i, y_i))
def forward(self, x, lengths=None):
    C, L = x.shape[1], x.shape[2]  # KP: (N, C, L)

    def _compute_statistics(x, m, axis=2, eps=self.eps):
        mean = (m * x).sum(axis)
        std = paddle.sqrt(
            (m * (x - mean.unsqueeze(axis)).pow(2)).sum(axis).clip(eps))
        return mean, std

    if lengths is None:
        lengths = paddle.ones([x.shape[0]])

    # Make binary mask of shape [N, 1, L]
    mask = length_to_mask(lengths * L, max_len=L)
    mask = mask.unsqueeze(1)

    # Expand the temporal context of the pooling layer by allowing the
    # self-attention to look at global properties of the utterance.
    if self.global_context:
        total = mask.sum(axis=2, keepdim=True).astype('float32')
        mean, std = _compute_statistics(x, mask / total)
        mean = mean.unsqueeze(2).tile((1, 1, L))
        std = std.unsqueeze(2).tile((1, 1, L))
        attn = paddle.concat([x, mean, std], axis=1)
    else:
        attn = x

    # Apply layers
    attn = self.conv(self.tanh(self.tdnn(attn)))

    # Filter out zero-paddings
    attn = paddle.where(
        mask.tile((1, C, 1)) == 0,
        paddle.ones_like(attn) * float("-inf"), attn)

    attn = F.softmax(attn, axis=2)
    mean, std = _compute_statistics(x, attn)

    # Append mean and std of the batch
    pooled_stats = paddle.concat((mean, std), axis=1)
    pooled_stats = pooled_stats.unsqueeze(2)
    return pooled_stats
def _compute_loss(self, prediction_tensor, target_tensor, weights,
                  class_indices=None):
    """Compute loss function.

    Args:
        prediction_tensor: A float tensor of shape [batch_size, num_anchors,
            num_classes] representing the predicted logits for each class.
        target_tensor: A float tensor of shape [batch_size, num_anchors,
            num_classes] representing one-hot encoded classification targets.
        weights: A float tensor of shape [batch_size, num_anchors].
        class_indices: (Optional) A 1-D integer tensor of class indices.
            If provided, computes loss only for the specified class indices.

    Returns:
        loss: A float tensor of shape [batch_size, num_anchors, num_classes]
            representing the value of the loss function.
    """
    weights = weights.unsqueeze(2)
    if class_indices is not None:
        weights *= indices_to_dense_vector(
            class_indices, prediction_tensor.shape[2]).reshape(
                (1, 1, -1)).astype(prediction_tensor.dtype)
    per_entry_cross_ent = _softmax_cross_entropy_with_logits(
        labels=target_tensor, logits=prediction_tensor)
    # convert [N, num_anchors] to [N, num_anchors, num_classes]
    per_entry_cross_ent = per_entry_cross_ent.unsqueeze(-1) * target_tensor
    prediction_probabilities = F.softmax(prediction_tensor, axis=-1)
    p_t = ((target_tensor * prediction_probabilities) +
           ((1 - target_tensor) * (1 - prediction_probabilities)))
    modulating_factor = 1.0
    if self._gamma:
        modulating_factor = paddle.pow(1.0 - p_t, self._gamma)
    alpha_weight_factor = 1.0
    if self._alpha is not None:
        alpha_weight_factor = paddle.where(
            target_tensor[..., 0] == 1,
            paddle.to_tensor(1 - self._alpha).astype(
                per_entry_cross_ent.dtype),
            paddle.to_tensor(self._alpha).astype(per_entry_cross_ent.dtype))
    focal_cross_entropy_loss = (modulating_factor * alpha_weight_factor *
                                per_entry_cross_ent)
    return focal_cross_entropy_loss * weights
def get_discriminator_inputs(self, inputs, raw_inputs, gen_logits,
                             gen_labels, use_softmax_sample):
    """Sample from the generator to create discriminator input."""
    # get generator token result
    sampled_tokens = (self.sample_from_softmax(
        gen_logits, use_softmax_sample)).detach()
    sampled_tokids = paddle.argmax(sampled_tokens, axis=-1)
    # update token only at mask position
    # gen_labels : [B, L], L contains -100(unmasked) or token value(masked)
    # mask_positions : [B, L], L contains 0(unmasked) or 1(masked)
    umask_positions = paddle.zeros_like(gen_labels)
    mask_positions = paddle.ones_like(gen_labels)
    mask_positions = paddle.where(gen_labels == -100, umask_positions,
                                  mask_positions)
    updated_inputs = self.update_inputs(inputs, sampled_tokids,
                                        mask_positions)
    # use inputs and updated_inputs to get discriminator labels
    labels = mask_positions * (paddle.ones_like(inputs) - paddle.equal(
        updated_inputs, raw_inputs).astype("int32"))
    return updated_inputs, labels, sampled_tokids
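# A minimal numeric sketch of the label computation above (hypothetical
# values, not from the source): masked positions where the generator's sample
# differs from the original token get label 1; everything else gets 0.
raw_inputs = paddle.to_tensor([[5, 6, 7, 8]])
updated_inputs = paddle.to_tensor([[5, 9, 7, 8]])    # generator replaced 6 -> 9
gen_labels = paddle.to_tensor([[-100, 6, 7, -100]])  # positions 1, 2 were masked
mask_positions = paddle.where(gen_labels == -100,
                              paddle.zeros_like(gen_labels),
                              paddle.ones_like(gen_labels))
labels = mask_positions * (paddle.ones_like(raw_inputs) - paddle.equal(
    updated_inputs, raw_inputs).astype("int64"))
print(labels.numpy())  # [[0 1 0 0]] -- only the replaced masked token is "fake"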
def rough_ROI(ref_scribble_labels):
    #### b*1*h*w
    dist = 20
    b, _, h, w = ref_scribble_labels.shape
    filter_ = paddle.zeros_like(ref_scribble_labels)
    to_fill = paddle.zeros_like(ref_scribble_labels)
    for i in range(b):
        no_background = (ref_scribble_labels[i] != -1)
        no_background = no_background.squeeze(0)

        no_b = no_background.nonzero()
        (h_min, w_min) = paddle.min(no_b, 0)
        (h_max, w_max) = paddle.max(no_b, 0)
        filter_[i, 0,
                max(h_min - dist, 0):min(h_max + dist, h - 1),
                max(w_min - dist, 0):min(w_max + dist, w - 1)] = 1

    final_scribble_labels = paddle.where(byte_(filter_),
                                         ref_scribble_labels, to_fill)
    return final_scribble_labels
def forward(self, pred, target, reduction='none'):
    """Forward function, based on fvcore.

    Args:
        pred (Tensor): prediction tensor.
        target (Tensor): target tensor; its shape must match pred.shape.
        reduction (str): how to reduce the loss, one of ('none', 'sum', 'mean').
    """
    assert reduction in ('none', 'sum', 'mean')
    target = target.detach()
    if self.beta < 1e-5:
        loss = paddle.abs(pred - target)
    else:
        n = paddle.abs(pred - target)
        cond = n < self.beta
        loss = paddle.where(cond, 0.5 * n**2 / self.beta,
                            n - 0.5 * self.beta)
    if reduction == 'mean':
        loss = loss.mean() if loss.size > 0 else 0.0 * loss.sum()
    elif reduction == 'sum':
        loss = loss.sum()
    return loss * self.loss_weight
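# A quick numeric check of the smooth-L1 branch above (hypothetical values):
# with beta = 1.0, |d| = 0.5 falls in the quadratic region (0.5 * 0.25 = 0.125)
# and |d| = 2.0 in the linear region (2.0 - 0.5 = 1.5).
pred = paddle.to_tensor([0.5, 2.0])
target = paddle.to_tensor([0.0, 0.0])
beta = 1.0
n = paddle.abs(pred - target)
loss = paddle.where(n < beta, 0.5 * n**2 / beta, n - 0.5 * beta)
print(loss.numpy())  # [0.125 1.5]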
def _compute_iou(pred_mask, gt_mask, ignore_mask=None, keep_ignore=False):
    if ignore_mask is not None:
        pred_mask = paddle.where(
            ignore_mask,
            paddle.zeros_like(pred_mask.astype('float32')),
            pred_mask.astype('float32'))

    reduction_dims = misc.get_dims_with_exclusion(len(gt_mask.shape), 0)
    pred_mask = pred_mask.astype('bool')
    m = pred_mask.numpy() | gt_mask.numpy()
    n = pred_mask.numpy() & gt_mask.numpy()
    union = np.mean(m.astype('float'), axis=tuple(reduction_dims))
    intersection = np.mean(n.astype('float'), axis=tuple(reduction_dims))
    nonzero = union > 0
    iou = intersection[nonzero] / union[nonzero]
    if not keep_ignore:
        return iou
    else:
        result = np.full_like(intersection, -1)
        result[nonzero] = iou
        return result
def forward(self, generator_prediction_scores,
            discriminator_prediction_scores, generator_labels,
            discriminator_labels):
    # generator loss
    gen_loss = self.gen_loss_fct(
        paddle.reshape(generator_prediction_scores, [-1, self.vocab_size]),
        paddle.reshape(generator_labels, [-1]))
    # TODO: the following four lines can be removed once
    # CrossEntropyLoss(reduction='mean') is improved
    umask_positions = paddle.zeros_like(generator_labels).astype("float32")
    mask_positions = paddle.ones_like(generator_labels).astype("float32")
    mask_positions = paddle.where(generator_labels == -100, umask_positions,
                                  mask_positions)
    gen_loss = gen_loss.sum() / mask_positions.sum()

    # discriminator loss
    seq_length = discriminator_labels.shape[1]
    disc_loss = self.disc_loss_fct(
        paddle.reshape(discriminator_prediction_scores, [-1, seq_length]),
        discriminator_labels.astype("float32"))

    return self.gen_weight * gen_loss + self.disc_weight * disc_loss
def test_api_broadcast(self, use_cuda=False):
    main_program = Program()
    with fluid.program_guard(main_program):
        x = fluid.layers.data(name='x', shape=[4, 1], dtype='float32')
        y = fluid.layers.data(name='y', shape=[4, 2], dtype='float32')
        x_i = np.array([[0.9383, 0.1983, 3.2, 1.2]]).astype('float32')
        y_i = np.array([[1.0, 1.0, 1.0, 1.0],
                        [1.0, 1.0, 1.0, 1.0]]).astype('float32')
        result = paddle.where((x > 1), x=x, y=y)
        for use_cuda in [False, True]:
            if use_cuda and not fluid.core.is_compiled_with_cuda():
                return
            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            exe = fluid.Executor(place)
            out = exe.run(fluid.default_main_program(),
                          feed={'x': x_i,
                                'y': y_i},
                          fetch_list=[result])
            assert np.array_equal(out[0], np.where((x_i > 1), x_i, y_i))
def forward(self, logits, label):
    """
    Forward computation.

    Args:
        logits (tuple|list): (seg_logit, edge_logit) Tensor, the data type
            is float32, float64. Shape is (N, C), where C is the number of
            classes, and if the shape is more than 2D, it is
            (N, C, D1, D2, ..., Dk), k >= 1. For edge_logit, C = 1.
        label (Tensor): Label tensor, the data type is int64. Shape is
            (N, C), where each value is 0 <= label[i] <= C-1, and if the
            shape is more than 2D, it is (N, C, D1, D2, ..., Dk), k >= 1.
    """
    seg_logit, edge_logit = logits[0], logits[1]
    if len(label.shape) != len(seg_logit.shape):
        label = paddle.unsqueeze(label, 1)
    if edge_logit.shape != label.shape:
        raise ValueError(
            'The shape of edge_logit should equal to the label, but they are {} != {}'
            .format(edge_logit.shape, label.shape))

    filler = paddle.ones_like(label) * self.ignore_index
    label = paddle.where(edge_logit > self.edge_threshold, label, filler)

    seg_logit = paddle.transpose(seg_logit, [0, 2, 3, 1])
    label = paddle.transpose(label, [0, 2, 3, 1])
    loss = F.softmax_with_cross_entropy(
        seg_logit, label, ignore_index=self.ignore_index, axis=-1)

    mask = label != self.ignore_index
    mask = paddle.cast(mask, 'float32')
    loss = loss * mask
    avg_loss = paddle.mean(loss) / (paddle.mean(mask) + self.EPS)
    if paddle.mean(mask) < self.mean_mask:
        self.mean_mask = paddle.mean(mask)

    label.stop_gradient = True
    mask.stop_gradient = True
    return avg_loss
def test_scalar(self):
    paddle.enable_static()
    main_program = Program()
    with fluid.program_guard(main_program):
        cond_shape = [2, 4]
        cond = fluid.layers.data(name='cond', shape=cond_shape, dtype='bool')
        x_data = 1.0
        y_data = 2.0
        cond_data = np.array([False, False, True, True]).astype('bool')
        result = paddle.where(condition=cond, x=x_data, y=y_data)
        for use_cuda in [False, True]:
            if use_cuda and not fluid.core.is_compiled_with_cuda():
                return
            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            exe = fluid.Executor(place)
            out = exe.run(fluid.default_main_program(),
                          feed={'cond': cond_data},
                          fetch_list=[result])
            expect = np.where(cond_data, x_data, y_data)
            assert np.array_equal(out[0], expect)
def mask_tokens(self, examples):
    if self.tokenizer.mask_token is None:
        raise ValueError(
            "the tokenizer does not have mask_token, please check!")
    mask_token_id = self.tokenizer.convert_tokens_to_ids(
        self.tokenizer.mask_token)

    raw_inputs, probability_matrix = self.add_special_tokens_and_set_maskprob(
        examples, True, self.max_seq_length)
    raw_inputs = self.tensorize_batch(raw_inputs, "int64")
    probability_matrix = self.tensorize_batch(probability_matrix, "float32")
    inputs = raw_inputs.clone()
    labels = raw_inputs.clone()

    total_indices = paddle.bernoulli(probability_matrix).astype(
        "bool").numpy()
    labels[~total_indices] = -100

    # 80% MASK
    indices_mask = paddle.bernoulli(paddle.full(
        labels.shape, 0.8)).astype("bool").numpy() & total_indices
    inputs[indices_mask] = mask_token_id

    # 10% Random
    indices_random = paddle.bernoulli(paddle.full(
        labels.shape, 0.5)).astype("bool").numpy(
        ) & total_indices & ~indices_mask
    random_words = paddle.randint(
        low=0,
        high=self.tokenizer.vocab_size,
        shape=labels.shape,
        dtype="int64")
    inputs = paddle.where(
        paddle.to_tensor(indices_random), random_words, inputs)

    # 10% Original
    return inputs, raw_inputs, labels
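# A standalone sketch of the 80/10/10 masking split used above (hypothetical
# shapes and ids, not from the source): of the selected positions, ~80% become
# [MASK], ~10% become random tokens, and the remaining ~10% keep the original
# token. Mirrors the source's pattern of indexing paddle tensors with numpy
# boolean masks.
mask_token_id = 103  # assumed [MASK] id
token_ids = paddle.randint(low=5, high=1000, shape=[1, 16], dtype="int64")
selected = paddle.bernoulli(paddle.full([1, 16], 0.15)).astype("bool").numpy()
inputs = token_ids.clone()
take_mask = paddle.bernoulli(paddle.full([1, 16], 0.8)).astype(
    "bool").numpy() & selected
inputs[take_mask] = mask_token_id
# Half of the remaining 20% (i.e. 10% overall) get a random token.
take_rand = paddle.bernoulli(paddle.full([1, 16], 0.5)).astype(
    "bool").numpy() & selected & ~take_mask
random_words = paddle.randint(low=5, high=1000, shape=[1, 16], dtype="int64")
inputs = paddle.where(paddle.to_tensor(take_rand), random_words, inputs)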
def _nn_features_per_object_for_chunk(reference_embeddings, query_embeddings,
                                      wrong_label_mask, k_nearest_neighbors,
                                      ys):
    """Extracts features for each object using nearest neighbor attention.

    Args:
        reference_embeddings: Tensor of shape [n_chunk, embedding_dim],
            the embedding vectors for the reference frame.
        query_embeddings: Tensor of shape [m_chunk, embedding_dim],
            the embedding vectors for the query frames.
        wrong_label_mask: Boolean mask of reference entries whose label does
            not match; their distances are padded out.
        k_nearest_neighbors: Integer, the number of nearest neighbors to use.

    Returns:
        nn_features: A float32 tensor of nearest neighbor features of shape
            [m_chunk, n_objects, feature_dim].
    """
    # reference_embeddings_key = reference_embeddings
    # query_embeddings_key = query_embeddings
    dists, ys = _flattened_pairwise_distances(reference_embeddings,
                                              query_embeddings, ys)
    dists = (paddle.unsqueeze(dists, 1) +
             paddle.unsqueeze(float_(wrong_label_mask), 0) *
             WRONG_LABEL_PADDING_DISTANCE)
    if k_nearest_neighbors == 1:
        features = paddle.min(dists, 2, keepdim=True)
    else:
        dists, _ = paddle.topk(-dists, k=k_nearest_neighbors, axis=2)
        dists = -dists
        valid_mask = (dists < WRONG_LABEL_PADDING_DISTANCE)
        masked_dists = dists * float_(valid_mask)
        pad_dist = paddle.max(masked_dists, axis=2, keepdim=True).tile(
            (1, 1, masked_dists.shape[-1]))
        dists = paddle.where(valid_mask, dists, pad_dist)
        # take mean of distances
        features = paddle.mean(dists, axis=2, keepdim=True)
    return features, ys
def _compute_expert_weights(self):
    """Computes the weight vector for the experts.

    Args:
        None.

    Returns:
        A tuple: (expert_weights, selector_outputs).
            expert_weights is the final weight vector of the experts.
            selector_outputs is a (num_nonzero, num_experts)-matrix whose
            i-th row represents the outputs of the i-th single-expert
            selector.
    """
    # Shape = (num_nonzero, 1, num_binary)
    smooth_step_activations = self._smooth_step(self._z_logits)

    # Shape = (num_nonzero, num_experts)
    selector_outputs = paddle.prod(
        paddle.where(self._binary_codes, smooth_step_activations,
                     1 - smooth_step_activations),
        axis=2)

    # Weights for the single-expert selectors: shape = (num_nonzero, 1)
    selector_weights = F.softmax(self._w_logits, axis=0)
    expert_weights = paddle.sum(selector_weights * selector_outputs, axis=0)

    return expert_weights, selector_outputs
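# A small numeric sketch of the binary-code selection above (hypothetical
# tensors, not from the source): with num_binary = 2 bits, expert e's score is
# the product over bits of s_b where e's code bit is 1 and (1 - s_b) where it
# is 0, so the scores over all 2^num_binary experts sum to 1.
num_experts, num_binary = 4, 2
# Codes for experts 0..3: 00, 01, 10, 11 -> shape (1, num_experts, num_binary)
binary_codes = paddle.to_tensor(
    [[[False, False], [False, True], [True, False], [True, True]]])
s = paddle.to_tensor([[[0.9, 0.2]]])  # smooth-step activations, shape (1, 1, 2)
selector_outputs = paddle.prod(paddle.where(binary_codes, s, 1 - s), axis=2)
print(selector_outputs.numpy())  # [[0.08 0.02 0.72 0.18]] -- sums to 1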
def forward(self, inputs):
    querys, keys, sess_length = inputs
    # assert(type(sess_length) == paddle.Tensor), f"At Attention SequencePoolingLayer expected inputs[2]'s type is paddle.Tensor, but got {type(sess_length)}"
    keys_length = keys.shape[1]
    key_masks = nn.functional.sequence_mask(sess_length, keys_length)

    querys = paddle.tile(querys.unsqueeze(1), [1, keys_length, 1])
    att_input = paddle.concat(
        [querys, keys, querys - keys, querys * keys], axis=-1)
    for i, layer in enumerate(self.layers):
        att_input = layer(att_input)
        # att_input = self.bn_layer[i](att_input)  # BatchNormalization
        att_input = self.activation(att_input)  # activation
    att_score = self.dnn(att_input)  # (N, 50, 1)
    att_score = paddle.transpose(att_score, [0, 2, 1])  # (N, 1, 50)

    if self.weight_normalization:
        paddings = paddle.ones_like(att_score) * (-2**32 + 1)
    else:
        paddings = paddle.zeros_like(att_score)

    att_score = paddle.where(
        key_masks.unsqueeze(1) == 1, att_score, paddings
    )  # key_masks.unsqueeze in order to keep shape same as att_score
    att_score = self.soft(att_score)
    out = paddle.matmul(att_score, keys)
    return out
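# A minimal sketch of the masking trick above (hypothetical values, assuming
# the usual `import paddle.nn.functional as F`): padded positions are filled
# with a very large negative number before softmax, so their attention weight
# becomes effectively zero.
scores = paddle.to_tensor([[[1.0, 2.0, 3.0]]])    # (N=1, 1, L=3)
mask = paddle.to_tensor([[[True, True, False]]])  # last key is padding
paddings = paddle.ones_like(scores) * (-2**32 + 1)
masked = paddle.where(mask, scores, paddings)
print(F.softmax(masked, axis=-1).numpy())  # ~[[[0.269 0.731 0.0]]]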
def forward(self, x):
    h = F.relu(self.conv1_1(x))
    h = F.relu(self.conv1_2(h))
    h = F.max_pool2d(h, 2, 2)
    h = F.relu(self.conv2_1(h))
    h = F.relu(self.conv2_2(h))
    h = F.max_pool2d(h, 2, 2)
    h = F.relu(self.conv3_1(h))
    h = F.relu(self.conv3_2(h))
    h = F.relu(self.conv3_3(h))
    f3_3 = h
    h = F.max_pool2d(h, 2, 2)
    h = F.relu(self.conv4_1(h))
    h = F.relu(self.conv4_2(h))
    h = F.relu(self.conv4_3(h))
    f4_3 = h
    h = F.max_pool2d(h, 2, 2)
    h = F.relu(self.conv5_1(h))
    h = F.relu(self.conv5_2(h))
    h = F.relu(self.conv5_3(h))
    f5_3 = h
    h = F.max_pool2d(h, 2, 2)
    h = F.relu(self.fc6(h))
    h = F.relu(self.fc7(h))
    ffc7 = h
    h = F.relu(self.conv6_1(h))
    h = F.relu(self.conv6_2(h))
    f6_2 = h
    h = F.relu(self.conv7_1(h))
    h = F.relu(self.conv7_2(h))
    f7_2 = h

    f3_3 = self.conv3_3_norm(f3_3)
    f4_3 = self.conv4_3_norm(f4_3)
    f5_3 = self.conv5_3_norm(f5_3)

    cls1 = self.conv3_3_norm_mbox_conf(f3_3)
    reg1 = self.conv3_3_norm_mbox_loc(f3_3)
    cls2 = self.conv4_3_norm_mbox_conf(f4_3)
    reg2 = self.conv4_3_norm_mbox_loc(f4_3)
    cls3 = self.conv5_3_norm_mbox_conf(f5_3)
    reg3 = self.conv5_3_norm_mbox_loc(f5_3)
    cls4 = self.fc7_mbox_conf(ffc7)
    reg4 = self.fc7_mbox_loc(ffc7)
    cls5 = self.conv6_2_mbox_conf(f6_2)
    reg5 = self.conv6_2_mbox_loc(f6_2)
    cls6 = self.conv7_2_mbox_conf(f7_2)
    reg6 = self.conv7_2_mbox_loc(f7_2)

    # max-out background label
    chunk = paddle.chunk(cls1, 4, 1)
    tmp_max = paddle.where(chunk[0] > chunk[1], chunk[0], chunk[1])
    bmax = paddle.where(tmp_max > chunk[2], tmp_max, chunk[2])
    cls1 = paddle.concat([bmax, chunk[3]], axis=1)

    return [cls1, reg1, cls2, reg2, cls3, reg3, cls4, reg4, cls5, reg5,
            cls6, reg6]
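# A small sketch of the max-out background trick above (hypothetical tensor):
# the first three of four channels are reduced to their elementwise maximum,
# using paddle.where(a > b, a, b) as an elementwise max.
cls = paddle.to_tensor([[[[1.]], [[3.]], [[2.]], [[5.]]]])  # (N=1, C=4, 1, 1)
c = paddle.chunk(cls, 4, 1)
tmp_max = paddle.where(c[0] > c[1], c[0], c[1])
bmax = paddle.where(tmp_max > c[2], tmp_max, c[2])
print(paddle.concat([bmax, c[3]], axis=1).numpy().ravel())  # [3. 5.]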
def forward(self, query, key, value, key_padding_mask=None,
            incremental_state=None, attn_mask=None):
    """
    Inputs of forward function
        query: [target length, batch size, embed dim]
        key: [sequence length, batch size, embed dim]
        value: [sequence length, batch size, embed dim]
        key_padding_mask: if True, mask padding based on batch size
        incremental_state: if provided, previous time steps are cached
        need_weights: output attn_output_weights
        static_kv: key and value are static

    Outputs of forward function
        attn_output: [target length, batch size, embed dim]
        attn_output_weights: [batch size, target length, sequence length]
    """
    q_shape = paddle.shape(query)
    src_shape = paddle.shape(key)
    q = self._in_proj_q(query)
    k = self._in_proj_k(key)
    v = self._in_proj_v(value)
    q *= self.scaling
    q = paddle.transpose(
        paddle.reshape(
            q, [q_shape[0], q_shape[1], self.num_heads, self.head_dim]),
        [1, 2, 0, 3])
    k = paddle.transpose(
        paddle.reshape(
            k, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]),
        [1, 2, 0, 3])
    v = paddle.transpose(
        paddle.reshape(
            v, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]),
        [1, 2, 0, 3])
    if key_padding_mask is not None:
        assert key_padding_mask.shape[0] == q_shape[1]
        assert key_padding_mask.shape[1] == src_shape[0]
    attn_output_weights = paddle.matmul(q,
                                        paddle.transpose(k, [0, 1, 3, 2]))
    if attn_mask is not None:
        attn_mask = paddle.unsqueeze(paddle.unsqueeze(attn_mask, 0), 0)
        attn_output_weights += attn_mask
    if key_padding_mask is not None:
        attn_output_weights = paddle.reshape(
            attn_output_weights,
            [q_shape[1], self.num_heads, q_shape[0], src_shape[0]])
        key = paddle.unsqueeze(paddle.unsqueeze(key_padding_mask, 1), 2)
        key = paddle.cast(key, 'float32')
        y = paddle.full(
            shape=paddle.shape(key), dtype='float32', fill_value='-inf')
        y = paddle.where(key == 0., key, y)
        attn_output_weights += y
    attn_output_weights = F.softmax(
        attn_output_weights.astype('float32'),
        axis=-1,
        dtype=paddle.float32
        if attn_output_weights.dtype == paddle.float16
        else attn_output_weights.dtype)
    attn_output_weights = F.dropout(
        attn_output_weights, p=self.dropout, training=self.training)
    attn_output = paddle.matmul(attn_output_weights, v)
    attn_output = paddle.reshape(
        paddle.transpose(attn_output, [2, 0, 1, 3]),
        [q_shape[0], q_shape[1], self.embed_dim])
    attn_output = self.out_proj(attn_output)

    return attn_output
def get_pred(self, bboxes, bbox_num, im_shape, scale_factor):
    """
    Rescale, clip and filter the bbox from the output of NMS to
    get final prediction.

    Notes:
        Currently only support bs = 1.

    Args:
        bboxes (Tensor): The output bboxes with shape [N, 6] after decode
            and NMS, including labels, scores and bboxes.
        bbox_num (Tensor): The number of prediction boxes of each batch with
            shape [1], and is N.
        im_shape (Tensor): The shape of the input image.
        scale_factor (Tensor): The scale factor of the input image.

    Returns:
        pred_result (Tensor): The final prediction results with shape [N, 6]
            including labels, scores and bboxes.
    """
    bboxes_list = []
    bbox_num_list = []
    id_start = 0
    # add fake bbox when output is empty for each batch
    for i in range(bbox_num.shape[0]):
        if bbox_num[i] == 0:
            bboxes_i = self.fake_bboxes
            bbox_num_i = self.fake_bbox_num
            id_start += 1
        else:
            bboxes_i = bboxes[id_start:id_start + bbox_num[i], :]
            bbox_num_i = bbox_num[i]
            id_start += bbox_num[i]
        bboxes_list.append(bboxes_i)
        bbox_num_list.append(bbox_num_i)
    bboxes = paddle.concat(bboxes_list)
    bbox_num = paddle.concat(bbox_num_list)

    origin_shape = paddle.floor(im_shape / scale_factor + 0.5)

    origin_shape_list = []
    scale_factor_list = []
    # scale_factor: scale_y, scale_x
    for i in range(bbox_num.shape[0]):
        expand_shape = paddle.expand(origin_shape[i:i + 1, :],
                                     [bbox_num[i], 2])
        scale_y, scale_x = scale_factor[i][0], scale_factor[i][1]
        scale = paddle.concat([scale_x, scale_y, scale_x, scale_y])
        expand_scale = paddle.expand(scale, [bbox_num[i], 4])
        origin_shape_list.append(expand_shape)
        scale_factor_list.append(expand_scale)

    self.origin_shape_list = paddle.concat(origin_shape_list)
    scale_factor_list = paddle.concat(scale_factor_list)

    # bboxes: [N, 6], label, score, bbox
    pred_label = bboxes[:, 0:1]
    pred_score = bboxes[:, 1:2]
    pred_bbox = bboxes[:, 2:]
    # rescale bbox to original image
    scaled_bbox = pred_bbox / scale_factor_list
    origin_h = self.origin_shape_list[:, 0]
    origin_w = self.origin_shape_list[:, 1]
    zeros = paddle.zeros_like(origin_h)
    # clip bbox to [0, original_size]
    x1 = paddle.maximum(paddle.minimum(scaled_bbox[:, 0], origin_w), zeros)
    y1 = paddle.maximum(paddle.minimum(scaled_bbox[:, 1], origin_h), zeros)
    x2 = paddle.maximum(paddle.minimum(scaled_bbox[:, 2], origin_w), zeros)
    y2 = paddle.maximum(paddle.minimum(scaled_bbox[:, 3], origin_h), zeros)
    pred_bbox = paddle.stack([x1, y1, x2, y2], axis=-1)
    # filter empty bbox
    keep_mask = nonempty_bbox(pred_bbox, return_mask=True)
    keep_mask = paddle.unsqueeze(keep_mask, [1])
    pred_label = paddle.where(keep_mask, pred_label,
                              paddle.ones_like(pred_label) * -1)
    pred_result = paddle.concat([pred_label, pred_score, pred_bbox], axis=1)
    return pred_result
def __call__(self, seg_preds, seg_masks, cate_labels, cate_scores,
             sum_masks=None):
    # sort and keep top nms_pre
    sort_inds = self._sort_score(cate_scores, self.pre_nms_top_n)
    seg_masks = paddle.gather(seg_masks, index=sort_inds)
    seg_preds = paddle.gather(seg_preds, index=sort_inds)
    sum_masks = paddle.gather(sum_masks, index=sort_inds)
    cate_scores = paddle.gather(cate_scores, index=sort_inds)
    cate_labels = paddle.gather(cate_labels, index=sort_inds)

    seg_masks = paddle.flatten(seg_masks, start_axis=1, stop_axis=-1)
    # inter.
    inter_matrix = paddle.mm(seg_masks, paddle.transpose(seg_masks, [1, 0]))
    n_samples = paddle.shape(cate_labels)
    # union.
    sum_masks_x = paddle.expand(sum_masks, shape=[n_samples, n_samples])
    # iou.
    iou_matrix = (inter_matrix /
                  (sum_masks_x + paddle.transpose(sum_masks_x, [1, 0]) -
                   inter_matrix))
    iou_matrix = paddle.triu(iou_matrix, diagonal=1)
    # label_specific matrix.
    cate_labels_x = paddle.expand(cate_labels, shape=[n_samples, n_samples])
    label_matrix = paddle.cast(
        (cate_labels_x == paddle.transpose(cate_labels_x, [1, 0])),
        'float32')
    label_matrix = paddle.triu(label_matrix, diagonal=1)

    # IoU compensation
    compensate_iou = paddle.max((iou_matrix * label_matrix), axis=0)
    compensate_iou = paddle.expand(compensate_iou,
                                   shape=[n_samples, n_samples])
    compensate_iou = paddle.transpose(compensate_iou, [1, 0])

    # IoU decay
    decay_iou = iou_matrix * label_matrix

    # matrix nms
    if self.kernel == 'gaussian':
        decay_matrix = paddle.exp(-1 * self.sigma * (decay_iou**2))
        compensate_matrix = paddle.exp(-1 * self.sigma *
                                       (compensate_iou**2))
        decay_coefficient = paddle.min(decay_matrix / compensate_matrix,
                                       axis=0)
    elif self.kernel == 'linear':
        decay_matrix = (1 - decay_iou) / (1 - compensate_iou)
        decay_coefficient = paddle.min(decay_matrix, axis=0)
    else:
        raise NotImplementedError

    # update the score.
    cate_scores = cate_scores * decay_coefficient
    y = paddle.zeros(shape=paddle.shape(cate_scores), dtype='float32')
    keep = paddle.where(cate_scores >= self.update_threshold, cate_scores,
                        y)
    keep = paddle.nonzero(keep)
    keep = paddle.squeeze(keep, axis=[1])
    # Prevent empty and increase fake data
    keep = paddle.concat(
        [keep, paddle.cast(paddle.shape(cate_scores)[0] - 1, 'int64')])

    seg_preds = paddle.gather(seg_preds, index=keep)
    cate_scores = paddle.gather(cate_scores, index=keep)
    cate_labels = paddle.gather(cate_labels, index=keep)

    # sort and keep top_k
    sort_inds = self._sort_score(cate_scores, self.post_nms_top_n)
    seg_preds = paddle.gather(seg_preds, index=sort_inds)
    cate_scores = paddle.gather(cate_scores, index=sort_inds)
    cate_labels = paddle.gather(cate_labels, index=sort_inds)
    return seg_preds, cate_scores, cate_labels
def _append_optimize_op(self, block, param_and_grad):
    assert isinstance(block, fluid.framework.Block)
    block.program._use_lamb = True

    m = moment1 = self._get_accumulator(self._moment1_acc_str,
                                        param_and_grad[0])
    v = self._get_accumulator(self._moment2_acc_str, param_and_grad[0])
    beta_1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
                                           param_and_grad[0])
    beta_2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
                                           param_and_grad[0])

    beta_1 = layers.fill_constant(dtype='float32',
                                  shape=[1],
                                  value=self._beta1,
                                  name='lamb_beta_1')
    beta_2 = layers.fill_constant(dtype='float32',
                                  shape=[1],
                                  value=self._beta2,
                                  name='lamb_beta_2')
    epsilon = layers.fill_constant(dtype='float32',
                                   shape=[1],
                                   value=self._epsilon,
                                   name='epsilon')

    one = paddle.ones(shape=[1]).astype('float32')
    zero = paddle.zeros(shape=[1]).astype('float32')

    next_m = paddle.multiply(m, beta_1) + paddle.multiply(
        param_and_grad[1], one - beta_1)
    next_v = paddle.multiply(v, beta_2) + paddle.multiply(
        paddle.pow(param_and_grad[1], 2), one - beta_2)

    beta1_correction = one - beta_1_pow_acc
    beta2_correction = one - beta_2_pow_acc

    next_m_unbiased = next_m / beta1_correction
    next_v_unbiased = next_v / beta2_correction

    update = next_m_unbiased / (paddle.sqrt(next_v_unbiased) + epsilon)

    if self._exclude_from_weight_decay_fn is not None \
            and self._exclude_from_weight_decay_fn(param_and_grad[0]):
        self._lamb_weight_decay = 0.0
    update += self._lamb_weight_decay * param_and_grad[0]

    w_norm = paddle.norm(param_and_grad[0], p=2)
    g_norm = paddle.norm(update, p=2)

    learning_rate = self._create_param_lr(param_and_grad)

    ratio = paddle.where(
        paddle.greater_than(w_norm, zero),
        paddle.where(paddle.greater_than(g_norm, zero), (w_norm / g_norm),
                     one), one)
    update_with_lr = ratio * learning_rate * update
    next_param = param_and_grad[0] - update_with_lr

    beta_1_pow_acc *= beta_1
    beta_2_pow_acc *= beta_2

    paddle.assign(next_m, m)
    paddle.assign(next_v, v)
    paddle.assign(next_param, param_and_grad[0])

    return None
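# A tiny numeric sketch of the nested trust-ratio guard above (hypothetical
# values): the LAMB ratio w_norm / g_norm is used only when both norms are
# positive; otherwise it falls back to 1.
one = paddle.ones([1])
zero = paddle.zeros([1])
w_norm = paddle.to_tensor([3.0])
g_norm = paddle.to_tensor([1.5])
ratio = paddle.where(
    paddle.greater_than(w_norm, zero),
    paddle.where(paddle.greater_than(g_norm, zero), w_norm / g_norm, one),
    one)
print(float(ratio))  # 2.0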
def do_eval(args):
    paddle.set_device(args.device)
    model_class, tokenizer_class = MODEL_CLASSES["gpt"]
    tokenizer = tokenizer_class.from_pretrained(args.model_name)

    if args.init_checkpoint_path is not None:
        model = GPTForPretraining(
            GPTModel(
                **model_class.pretrained_init_configuration[
                    args.model_name]))
        logger.info("Load model checkpoint from %s" %
                    args.init_checkpoint_path)
        model_dict = paddle.load(os.path.join(args.init_checkpoint_path))
        model.set_dict(model_dict)
    else:
        model = model_class.from_pretrained(args.model_name)

    tic_eval = time.time()
    eval_data_loader = create_eval_dataset(args)
    model.eval()
    total_score = 0
    score_name = "loss" if not args.cloze_eval else "number correct"
    with paddle.no_grad():
        for step, batch in enumerate(eval_data_loader):
            tokens, loss_mask, attention_mask, position_ids, labels = batch
            preds = model(tokens, position_ids, attention_mask)
            if not args.cloze_eval:
                masked_lm_loss = paddle.nn.functional.cross_entropy(
                    preds, labels, reduction="none")
                loss = paddle.sum(masked_lm_loss * loss_mask)
                total_score += loss.numpy() / (args.num_tokenized_tokens - 1)
            else:
                outputs = paddle.argmax(preds, -1)
                acc = paddle.cast(outputs == labels, 'float32')
                acc = paddle.where(
                    paddle.cast(loss_mask, 'bool'), acc,
                    paddle.ones_like(acc))
                acc = paddle.sum(paddle.prod(acc, -1))
                total_score += acc.numpy()
            if step % args.logging_steps == 0:
                logger.info(
                    "step %d, batch: %d, %s: %f, speed: %.2f step/s" %
                    (step, step, score_name, total_score,
                     args.logging_steps / (time.time() - tic_eval)))
                tic_eval = time.time()

    if not args.cloze_eval:
        total_loss = float(total_score)
        ppl = math.exp(min(20, total_loss))
        token_ratio = (args.num_tokenized_tokens - 1) / (
            args.num_original_tokens - 1)
        adjusted_ppl = math.exp(min(20, total_loss * token_ratio))
        string = ' validation results on {} | '.format(args.eval_path)
        string += 'avg loss: {:.4E} | '.format(total_loss)
        string += 'ppl: {:.4E} | '.format(ppl)
        string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl)
        string += 'token ratio: {} |'.format(token_ratio)
    else:
        num_correct = float(total_score)
        acc = float(num_correct / args.num_examples)
        string = ' validation results on {} | '.format(args.eval_path)
        string += 'number correct: {:.4E} | '.format(num_correct)
        string += 'total examples: {:.4E} | '.format(args.num_examples)
        string += 'avg accuracy: {:.4E}'.format(acc)
    logger.info(string)