def construct(self, logits, label):
    label = self.onehot(label, F.shape(logits)[-1], self.on_value, self.off_value)
    sigmiod_cross_entropy = self.sigmiod_cross_entropy(logits, label)
    sigmoid = self.sigmoid(logits)
    label = F.cast(label, mstype.float32)
    p_t = label * sigmoid + (1 - label) * (1 - sigmoid)
    modulating_factor = self.pow(1 - p_t, self.gamma)
    alpha_weight_factor = label * self.alpha + (1 - label) * (1 - self.alpha)
    focal_loss = modulating_factor * alpha_weight_factor * sigmiod_cross_entropy
    return focal_loss
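# Illustrative sketch (not part of the original source): the construct above computes the
# sigmoid focal loss FL = alpha_t * (1 - p_t)^gamma * BCE(logits, label). A minimal NumPy
# version for a single binary logit, assuming the common defaults gamma=2.0 and alpha=0.25:
import numpy as np

def focal_loss_sketch(logit, label, gamma=2.0, alpha=0.25):
    sigmoid = 1.0 / (1.0 + np.exp(-logit))
    # numerically naive binary cross entropy, for illustration only
    bce = -(label * np.log(sigmoid) + (1 - label) * np.log(1 - sigmoid))
    p_t = label * sigmoid + (1 - label) * (1 - sigmoid)
    alpha_t = label * alpha + (1 - label) * (1 - alpha)
    return alpha_t * (1 - p_t) ** gamma * bce

# Easy positives (p_t close to 1) are strongly down-weighted by (1 - p_t)^gamma:
# focal_loss_sketch(4.0, 1.0) is far smaller than focal_loss_sketch(-4.0, 1.0).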
def _tensor_apply_decay_with_sparse(weight_decay, if_apply, weight, gradient):
    """Get grad with weight_decay."""
    if if_apply:
        indices = gradient.indices
        values = op_add((op_gather(weight, indices, 0) * F.cast(weight_decay, F.dtype(weight)),
                         gradient.values))
        shape = gradient.dense_shape
        return RowTensor(indices, values, shape)
    return gradient
def construct(self, prob, halting_prob, n_updates):
    # zeros = self.zeros_like(halting_prob)
    # ones = self.ones_like(halting_prob)

    # Mask for inputs which have not halted last cycle
    running = F.cast(halting_prob < 1.0, ms.float32)
    # running = self.select(halting_prob < 1.0, ones, zeros)

    # Add the halting probability for this step to the halting
    # probabilities for those inputs which haven't halted yet
    add_prob = prob * running
    new_prob = halting_prob + add_prob
    mask_run = F.cast(new_prob <= self.threshold, ms.float32)
    mask_halt = F.cast(new_prob > self.threshold, ms.float32)
    # mask_run = self.select(new_prob <= self.threshold, ones, zeros)
    # mask_halt = self.select(new_prob > self.threshold, ones, zeros)

    # Mask of inputs which haven't halted, and didn't halt this step
    still_running = mask_run * running
    running_prob = halting_prob + prob * still_running

    # Mask of inputs which halted at this step
    new_halted = mask_halt * running

    # Compute remainders for the inputs which halted at this step
    remainders = new_halted * (1.0 - running_prob)

    # Add the remainders to those inputs which halted at this step
    # halting_prob = new_prob + remainders
    dp = add_prob + remainders

    # Increment n_updates for all inputs which are still running
    # n_updates = n_updates + running
    dn = running

    # Compute the weight to be applied to the new state and output:
    # 0 when the input has already halted,
    # prob when the input hasn't halted yet,
    # the remainder when it halted this step
    update_weights = prob * still_running + new_halted * remainders
    w = F.expand_dims(update_weights, -1)
    return w, dp, dn
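# Illustrative sketch (not part of the original source): one step of the ACT-style halting
# update above in NumPy, assuming a halting threshold of 0.99 (the actual value of
# self.threshold is defined elsewhere). It returns the update weights, the increment to the
# accumulated halting probability, and the increment to the update counter.
import numpy as np

def act_halting_step_sketch(prob, halting_prob, threshold=0.99):
    running = (halting_prob < 1.0).astype(np.float32)
    add_prob = prob * running
    new_prob = halting_prob + add_prob
    still_running = (new_prob <= threshold).astype(np.float32) * running
    new_halted = (new_prob > threshold).astype(np.float32) * running
    running_prob = halting_prob + prob * still_running
    remainders = new_halted * (1.0 - running_prob)
    update_weights = prob * still_running + new_halted * remainders
    return update_weights, add_prob + remainders, running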
def _convert_img_dtype_to_float32(img, max_val):
    """Convert img dtype to float32."""
    # Usually max_val is 1.0 or 255; scale the pixel values if max_val > 1.,
    # otherwise just cast.
    ret = F.cast(img, mstype.float32)
    max_val = F.scalar_cast(max_val, mstype.float32)
    if max_val > 1.:
        scale = 1. / max_val
        ret = ret * scale
    return ret
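# Illustrative sketch (not part of the original source): the same scaling rule in plain NumPy.
# A uint8 image with max_val=255 is rescaled to [0, 1]; a float image with max_val=1.0 is
# only cast.
import numpy as np

def to_float32_sketch(img, max_val):
    ret = img.astype(np.float32)
    if max_val > 1.0:
        ret = ret / float(max_val)
    return ret

# to_float32_sketch(np.array([0, 128, 255], dtype=np.uint8), 255) -> [0., 0.502, 1.]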
def construct(self, x):
    input_shape = F.shape(x)[2:4]
    input_shape = F.cast(self.tenser_to_array(input_shape), ms.float32)
    big_object_output, medium_object_output, small_object_output = self.feature_map(x)
    output_big = self.detect_1(big_object_output, input_shape)
    output_me = self.detect_2(medium_object_output, input_shape)
    output_small = self.detect_3(small_object_output, input_shape)
    # output_big is the final output, which has the smallest feature map
    return output_big, output_me, output_small
def construct(self, x, query_hidden_state, input_mask, layer_past=None):
    input_x = self.layernorm1(x)
    input_x = F.cast(input_x, self.dtype)
    attention, layer_present = self.attention(input_x, query_hidden_state, input_mask, layer_past)
    if self.post_layernorm_residual:
        x = self.add(input_x, attention)
    else:
        x = self.add(x, attention)
    output_x = self.layernorm2(x)
    output_x = F.cast(output_x, self.dtype)
    mlp_logit = self.output(output_x)
    if self.post_layernorm_residual:
        output = self.last_add(output_x, mlp_logit)
    else:
        output = self.last_add(x, mlp_logit)
    return output, layer_present
def construct(self, input_ids): """evaluation net""" input_mask = F.cast(F.not_equal(input_ids, 0), mstype.float32) logits = self.backbone(input_ids, input_mask) outputs = None if self.generate: outputs = nn.LogSoftmax()(logits) outputs = F.tensor_pow(np.e, outputs) else: outputs = self.argmax(logits) return outputs
def _clip_grad(clip_type, clip_value, grad):
    """
    Clip gradients.

    Inputs:
        clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.
        clip_value (float): Specifies how much to clip.
        grad (tuple[Tensor]): Gradients.

    Outputs:
        tuple[Tensor], clipped gradients.
    """
    if clip_type not in (0, 1):
        return grad
    dt = F.dtype(grad)
    if clip_type == 0:
        new_grad = C.clip_by_value(grad,
                                   F.cast(F.tuple_to_array((-clip_value,)), dt),
                                   F.cast(F.tuple_to_array((clip_value,)), dt))
    else:
        new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
    return new_grad
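# Illustrative sketch (not part of the original source): the two clipping modes of _clip_grad
# in plain NumPy, for a single gradient tensor.
import numpy as np

def clip_grad_sketch(clip_type, clip_value, grad):
    if clip_type == 0:
        # clip by value: every element is limited to [-clip_value, clip_value]
        return np.clip(grad, -clip_value, clip_value)
    # clip by norm: rescale the whole tensor if its L2 norm exceeds clip_value
    norm = np.linalg.norm(grad)
    if norm > clip_value:
        return grad * (clip_value / norm)
    return grad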
def _tensors_cast_datatype(datatype, grad):
    """
    Cast gradient to datatype.

    Args:
        datatype (mstype): The destination datatype of the gradient.
        grad (Tensor): The gradient tensor before operation.

    Returns:
        Tensor, the gradient tensor after operation.
    """
    return F.cast(grad, datatype)
def _tensors_cast_datatype(datatype, parameters):
    """
    Cast parameters to datatype.

    Args:
        datatype (mstype): The destination datatype of the parameters.
        parameters (Tensor): The parameters before operation.

    Returns:
        Tensor, the parameters after operation.
    """
    return F.cast(parameters, datatype)
def construct(self, pred_label, gt_label, num_matched_boxes):
    gt_label = F.cast(gt_label, mstype.int32)
    mask = F.cast(self.less(0, gt_label), mstype.float32)
    gt_label_shape = F.shape(gt_label)
    pred_label = F.reshape(pred_label, (-1, self.num_classes))
    gt_label = F.reshape(gt_label, (-1,))
    cross_entropy = self.cross_entropy(pred_label, gt_label)
    cross_entropy = F.reshape(cross_entropy, gt_label_shape)

    # Hard example mining
    num_matched_boxes = F.reshape(num_matched_boxes, (-1,))
    neg_masked_cross_entropy = F.cast(cross_entropy * (1 - mask), mstype.float16)
    _, loss_idx = self.sort_descend(neg_masked_cross_entropy, self.num_boxes)
    _, relative_position = self.sort(F.cast(loss_idx, mstype.float16), self.num_boxes)
    num_neg_boxes = self.minimum(num_matched_boxes * self.neg_pre_positive, self.num_boxes)
    tile_num_neg_boxes = self.tile(self.expand_dims(num_neg_boxes, -1), (1, self.num_boxes))
    top_k_neg_mask = F.cast(self.less(relative_position, tile_num_neg_boxes), mstype.float32)

    class_loss = self.reduce_sum(cross_entropy * (mask + top_k_neg_mask), 1)
    return self.reduce_mean(class_loss / F.cast(num_matched_boxes, mstype.float32), 0)
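# Illustrative sketch (not part of the original source): the double-sort trick used in the hard
# example mining step above. Sorting the negative losses in descending order and then sorting
# the resulting indices yields, for every box, its rank among the negatives; keeping ranks below
# num_neg selects the hardest negative boxes.
import numpy as np

def hard_negative_mask_sketch(neg_loss, num_neg):
    loss_idx = np.argsort(-neg_loss)           # box indices, largest loss first
    relative_position = np.argsort(loss_idx)   # rank of each box among the negatives
    return (relative_position < num_neg).astype(np.float32)

# hard_negative_mask_sketch(np.array([0.1, 0.9, 0.5, 0.3]), 2) -> [0., 1., 1., 0.]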
def _attn(self, query, key, value, attention_mask):
    """
    Get the weighted score along the seq_length.

    Inputs:
        query: the query matrix
        key: the key matrix
        value: the value matrix
        attention_mask: the attention mask matrix with shape
            (batch_size, 1, seq_length, seq_length)

    Returns:
        weighted_values: Tensor, the weighted sum scores
    """
    if not self.scale:
        query = query / F.cast(self.coeff, F.dtype(query))
        key = key / F.cast(self.coeff, F.dtype(key))
    score = self.batch_matmul(query, key)
    if self.scale:
        score = self.real_div(score, P.Cast()(self.scale_factor, P.DType()(score)))

    ori_dtype = P.DType()(score)
    score = P.Cast()(score, mstype.float32)
    multiply_out = self.sub(P.Cast()(F.tuple_to_array((1.0,)), P.DType()(score)),
                            P.Cast()(attention_mask, P.DType()(score)))
    adder = self.mul(multiply_out, self.multiply_data)
    attention_scores = self.add(adder, score)

    shape = F.shape(attention_scores)
    attention_probs = self.softmax(F.reshape(attention_scores, (shape[0], -1, shape[-1])))
    attention_probs = P.Cast()(attention_probs, ori_dtype)
    attention_probs = F.reshape(attention_probs, shape)

    attention_probs = self.prob_dropout(attention_probs)
    weighted_values = self.batch_matmul(attention_probs, value)
    return weighted_values
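# Illustrative sketch (not part of the original source): the masking step above turns positions
# where attention_mask == 0 into large negative scores before the softmax, so they receive
# (near-)zero probability. self.multiply_data is assumed here to be a large negative constant
# such as -10000.0.
import numpy as np

def masked_softmax_sketch(scores, attention_mask, multiply_data=-10000.0):
    adder = (1.0 - attention_mask) * multiply_data
    scores = scores + adder
    exp = np.exp(scores - scores.max(axis=-1, keepdims=True))
    return exp / exp.sum(axis=-1, keepdims=True)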
def _tensors_cast_datatype_with_sparse(datatype, grad):
    """
    Cast gradient to datatype.

    Args:
        datatype (mstype): The destination datatype of the gradient.
        grad (RowTensor): The gradient before operation.

    Returns:
        RowTensor, the gradient after operation.
    """
    dout = F.cast(grad.values, datatype)
    return RowTensor(grad.indices, dout, grad.dense_shape)
def construct(self, x, hx): """construct""" x = F.cast(x, mstype.float16) if self.batch_first: x = self.transpose(x, (1, 0, 2)) # stack lstm h, c = hx hn = cn = None for i in range(self.num_layers): if self.bidirectional: x, (hn, cn) = self.lstm(x, h[i], c[i], self.weight_fw[i], self.bias_fw[i], self.weight_bw[i], self.bias_bw[i]) else: x, (hn, cn) = self.lstm(x, h[i], c[i], self.weight_fw[i], self.bias_fw[i]) if self.batch_first: x = self.transpose(x, (1, 0, 2)) x = F.cast(x, mstype.float32) hn = F.cast(x, mstype.float32) cn = F.cast(x, mstype.float32) return x, (hn, cn)
def construct(self, output_hm, output_wh, output_off, output_kps, hm, reg_mask, ind, wh,
              wight_mask, hm_offset, hps_mask, landmarks):
    """
    Construct method.
    """
    hm_loss = self.cls_loss(output_hm, hm)  # 1. focal loss, center points
    wh_loss = self.reg_loss(output_wh, ind, wh, wight_mask)  # 2. width and height
    off_loss = self.reg_loss(output_off, ind, hm_offset, wight_mask)  # 3. offset
    lm_loss = self.reg_loss_cmask(output_kps, hps_mask, ind, landmarks)  # 4. landmark loss

    loss = self.hm_weight * hm_loss + self.wh_weight * wh_loss + \
        self.off_weight * off_loss + self.lm_weight * lm_loss

    # depend is needed when wight_mask and reg_mask are not used
    F.depend(loss, F.sqrt(F.cast(wight_mask, mstype.float32)))
    F.depend(loss, F.sqrt(F.cast(reg_mask, mstype.float32)))
    # add a print here when you want to see loss details for debugging
    return loss
def _tensors_cast_datatype_with_sparse(datatype, grad):
    """
    Cast gradient to datatype.

    Args:
        datatype (mstype): The destination datatype of the gradient.
        grad (Tuple): The gradient tuple (indices, values, dense_shape) before operation.

    Returns:
        Tuple, the gradient tuple after operation.
    """
    dout = F.cast(grad[1], datatype)
    return (grad[0], dout, grad[2])
def construct(self, data, coord_mask, conf_pos_mask, conf_neg_mask, cls_mask, t_coord, t_conf,
              t_cls, gt_list, coord_mask_1, conf_pos_mask_1, conf_neg_mask_1, cls_mask_1,
              t_coord_1, t_conf_1, t_cls_1, gt_list_1, coord_mask_2, conf_pos_mask_2,
              conf_neg_mask_2, cls_mask_2, t_coord_2, t_conf_2, t_cls_2, gt_list_2, sens=None):
    '''construct'''
    weights = self.weights
    loss = self.network(data, coord_mask, conf_pos_mask, conf_neg_mask, cls_mask, t_coord,
                        t_conf, t_cls, gt_list, coord_mask_1, conf_pos_mask_1, conf_neg_mask_1,
                        cls_mask_1, t_coord_1, t_conf_1, t_cls_1, gt_list_1, coord_mask_2,
                        conf_pos_mask_2, conf_neg_mask_2, cls_mask_2, t_coord_2, t_conf_2,
                        t_cls_2, gt_list_2)

    # init overflow buffer
    init = self.alloc_status()
    # clear overflow buffer
    init = F.depend(init, loss)
    clear_status = self.clear_status(init)

    if sens is None:
        scaling_sens = self.loss_scale
    else:
        scaling_sens = sens
    scaling_sens = F.depend(scaling_sens, clear_status)

    grads = self.grad(self.network, weights)(data, coord_mask, conf_pos_mask, conf_neg_mask,
                                             cls_mask, t_coord, t_conf, t_cls, gt_list,
                                             coord_mask_1, conf_pos_mask_1, conf_neg_mask_1,
                                             cls_mask_1, t_coord_1, t_conf_1, t_cls_1, gt_list_1,
                                             coord_mask_2, conf_pos_mask_2, conf_neg_mask_2,
                                             cls_mask_2, t_coord_2, t_conf_2, t_cls_2, gt_list_2,
                                             F.cast(scaling_sens, F.dtype(loss)))
    grads = self.hyper_map(F.partial(_grad_scale, scaling_sens), grads)
    if self.reducer_flag:
        grads = self.grad_reducer(grads)

    # get the overflow buffer
    init = F.depend(init, grads)
    get_status = self.get_status(init)
    init = F.depend(init, get_status)
    # sum overflow buffer elements; 0: no overflow, >0: overflow
    flag_sum = self.reduce_sum(init, (0,))
    if self.is_distributed:
        # sum the overflow flag over devices
        flag_reduce = self.allreduce(flag_sum)
        cond = self.less_equal(self.base, flag_reduce)
    else:
        cond = self.less_equal(self.base, flag_sum)
    opt = self.optimizer(grads)
    ret = (loss, cond, scaling_sens)
    return F.depend(ret, opt)
def construct(self, x):
    # VGG16 backbone: block1~5
    block4, x = self.backbone(x)

    # SSD blocks: block6~7
    x = self.b6_1(x)  # 1024
    x = self.b6_2(x)
    x = self.b7_1(x)  # 1024
    x = self.b7_2(x)
    block7 = x

    # Extra Feature Layers: block8~11
    x = self.b8_1(x)  # 256
    x = self.b8_2(x)  # 512
    block8 = x
    x = self.b9_1(x)  # 128
    x = self.b9_2(x)  # 256
    block9 = x
    x = self.b10_1(x)  # 128
    x = self.b10_2(x)  # 256
    block10 = x
    x = self.b11_1(x)  # 128
    x = self.b11_2(x)  # 256
    block11 = x

    # boxes
    multi_feature = (block4, block7, block8, block9, block10, block11)
    pred_loc, pred_label = self.multi_box(multi_feature)
    if not self.training:
        pred_label = self.activation(pred_label)
    pred_loc = F.cast(pred_loc, mstype.float32)
    pred_label = F.cast(pred_label, mstype.float32)
    return pred_loc, pred_label
def construct(self, input_ids, input_position=None, attention_mask=None):
    tokens = self.slice(input_ids, (0, 0), (self.batch_size, -1), (1, 1))
    input_position = self.slice(input_position, (0, 0), (self.batch_size, self.len), (1, 1))
    attention_mask = self.slice_mask(attention_mask, (0, 0, 0),
                                     (self.batch_size, self.len, self.len), (1, 1, 1))
    input_mask = F.cast(self.not_equal(tokens, self.eos_token), mstype.float32)
    logits = self.network(tokens, input_mask, input_position, attention_mask)
    labels = self.slice(input_ids, (0, 1), (self.batch_size, self.len + 1), (1, 1))
    output = self.loss(logits, labels, input_mask)
    return output
def construct(self, input_ids, input_position, attention_mask):
    # tokens = input_ids[:, :-1]
    ret = None
    for i in range(self.micro_batch_step):
        micro_input, micro_input_position, micro_attention_mask = self.micro_input[i](
            input_ids, i, input_position, attention_mask)
        tokens = self.slice(micro_input, (0, 0),
                            (self.batch_size // self.micro_batch_step, -1), (1, 1))
        input_mask = F.cast(self.not_equal(tokens, self.eos_token), mstype.float32)
        logits = self.network(tokens, input_mask, micro_input_position, micro_attention_mask)
        labels = self.slice(micro_input, (0, 1),
                            (self.batch_size // self.micro_batch_step, self.len + 1), (1, 1))
        output = self.loss(logits, labels, input_mask)
        if ret is not None:
            ret = ret + output
        else:
            ret = output
    return ret
def construct(self, positions, forces, energy):
    outputs = self._network(positions)
    foutputs = -1 * self.grad_op(self._network)(positions)

    if self.add_cast_fp32:
        forces = F.mixed_precision_cast(ms.float32, forces)
        energy = F.mixed_precision_cast(ms.float32, energy)
        outputs = F.cast(outputs, ms.float32)

    if self._energy_fn is None:
        eloss = 0
    else:
        eloss = self._energy_fn(outputs, energy)

    if self._force_fn is None:
        floss = 0
    else:
        floss = self._force_fn(foutputs, forces)

    return eloss, floss, outputs, energy, foutputs, forces
def construct(self, x, h, c, w_f, b_f, w_b=None, b_b=None):
    """construct"""
    x = F.cast(x, mstype.float16)
    if self.bidirectional:
        y1, h1, c1, _, _, _, _, _ = self.dynamic_rnn(x, w_f, b_f, None, h[0], c[0])
        r_x = self.reverseV2(x)
        y2, h2, c2, _, _, _, _, _ = self.dynamic_rnn(r_x, w_b, b_b, None, h[1], c[1])
        y2 = self.reverseV2(y2)

        output = self.concat((y1, y2))
        hn = self.concat((h1, h2))
        cn = self.concat((c1, c2))
        return output, (hn, cn)
    y1, h1, c1, _, _, _, _, _ = self.dynamic_rnn(x, w_f, b_f, None, h[0], c[0])
    return y1, (h1, c1)
def construct(self, input_ids, input_mask, table, input_position, attention_mask, layer_past=None):
    """PanguAlpha model"""
    if not self.use_past:
        layer_past = self.past
    hidden_states = self.pangu_alpha_embedding(input_ids, table, input_position)
    attention_mask = self.pangu_alpha_mask(input_mask, attention_mask)

    present_layer = ()
    for i in range(self.num_layers - 1):
        hidden_states, present = self.blocks[i](hidden_states, attention_mask, layer_past)
        present_layer = present_layer + (present,)

    top_query_hidden_states = self.top_query_embedding(input_position)
    hidden_states, present = self.blocks[self.num_layers - 1](hidden_states,
                                                              top_query_hidden_states,
                                                              attention_mask, layer_past)
    present_layer = present_layer + (present,)

    output_state = self.layernorm(hidden_states)
    output_state = F.cast(output_state, self.dtype)
    return output_state, present_layer
def _tensors_allreduce_post(degree, mean, allreduce_filter, grad):
    """
    Apply allreduce on gradient in PyNative mode.

    Args:
        degree (int): The mean coefficient.
        mean (bool): When mean is true, the mean coefficient (degree) would apply on gradients.
        allreduce_filter (bool): When it is true, allreduce would apply.
        grad (Tensor): The gradient tensor before operation.

    Returns:
        Tensor, the gradient tensor after operation.
    """
    if allreduce_filter:
        if mean:
            grad = F.tensor_mul(grad, F.cast(degree, F.dtype(grad)))
        return grad
    return grad
def __init__(self, tot_atoms):
    super().__init__()

    # tot_atoms: A
    # tot_neigh: N = A - 1
    tot_neigh = tot_atoms - 1
    arange = nn.Range(tot_atoms)
    nrange = nn.Range(tot_neigh)

    self.ones = P.Ones()
    self.aones = self.ones((tot_atoms), ms.int32)
    self.nones = self.ones((tot_neigh), ms.int32)

    # neighbors for no connection (A*N)
    # [[0, 0, ..., 0],
    #  [1, 1, ..., 1],
    #  ...............
    #  [N, N, ..., N]]
    self.nnc = F.expand_dims(arange(), -1) * self.nones

    # copy of the index range (A*N)
    # [[0, 1, ..., N-1],
    #  [0, 1, ..., N-1],
    #  .................
    #  [0, 1, ..., N-1]]
    crange = self.ones((tot_atoms, 1), ms.int32) * nrange()

    # neighbors for full connection (A*N)
    # [[1, 2, 3, ..., N],
    #  [0, 2, 3, ..., N],
    #  [0, 1, 3, ..., N],
    #  ..................
    #  [0, 1, 2, ..., N-1]]
    self.nfc = crange + F.cast(self.nnc <= crange, ms.int32)

    crange1 = crange + 1
    # the matrix for index range (A*N)
    # [[1, 2, 3, ..., N],
    #  [1, 2, 3, ..., N],
    #  [2, 2, 3, ..., N],
    #  [3, 3, 3, ..., N],
    #  ..................
    #  [N, N, N, ..., N]]
    self.mat_idx = F.select(crange1 > self.nnc, crange1, self.nnc)
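# Illustrative sketch (not part of the original source): the same index matrices in NumPy for
# tot_atoms=4, to make the comments above concrete. Row i of nfc lists the indices of all atoms
# except atom i, i.e. the fully connected neighbor list.
import numpy as np

A = 4
N = A - 1
nnc = np.arange(A)[:, None] * np.ones(N, dtype=np.int32)    # [[0,0,0],[1,1,1],[2,2,2],[3,3,3]]
crange = np.ones((A, 1), dtype=np.int32) * np.arange(N)     # [[0,1,2]] repeated for each row
nfc = crange + (nnc <= crange).astype(np.int32)             # [[1,2,3],[0,2,3],[0,1,3],[0,1,2]]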
def construct(self, logits, labels):
    logits = self.transpose(logits, (0, 2, 3, 1))
    logits = self.reshape(logits, (-1, self.num))
    labels = F.cast(labels, mstype.int32)
    labels = self.reshape(labels, (-1,))
    one_hot_labels = self.one_hot(labels)
    losses = self.cross_entropy(logits, one_hot_labels)[0]

    weights = self.cast(self.not_equal(labels, self.ignore_label), mstype.float32) * self.loss_weight
    weighted_losses = self.mul(losses, weights)
    loss = self.reduce_sum(weighted_losses, (0,))

    zeros = self.fill(mstype.float32, self.shape(weights), 0.0)
    ones = self.fill(mstype.float32, self.shape(weights), 1.0)
    present = self.select(self.equal(weights, zeros), zeros, ones)
    present = self.reduce_sum(present, (0,))

    zeros = self.fill(mstype.float32, self.shape(present), 0.0)
    min_control = self.fill(mstype.float32, self.shape(present), 1.0)
    present = self.select(self.equal(present, zeros), min_control, present)
    loss = loss / present
    return loss
def construct(self, x1, x2, y):
    F.same_type_shape(x1, x2)
    _check_reduced_shape_valid(F.shape(x1), F.shape(y), (1,), self.cls_name)
    # if target > 0, 1 - cosine(x1, x2)
    # else, max(0, cosine(x1, x2) - margin)
    np_eps = const_utils.get_np_eps(F.dtype(x1))
    eps = F.cast(np_eps, F.dtype(x1))

    prod_sum = self.reduce_sum(x1 * x2, (1,))
    square1 = self.reduce_sum(F.square(x1), (1,)) + eps
    square2 = self.reduce_sum(F.square(x2), (1,)) + eps
    denom = F.sqrt(square1 * square2)
    cosine = prod_sum / denom

    pos_value = 1.0 - cosine
    neg_value = self.maximum(cosine - self.margin, 0.0)
    zeros = F.zeros_like(cosine)
    pos_part = F.select(y == 1, pos_value, zeros)
    neg_part = F.select(y == -1, neg_value, zeros)
    output_unreduced = pos_part + neg_part

    return self.get_loss(output_unreduced)
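# Illustrative sketch (not part of the original source): the per-sample cosine embedding loss
# above in NumPy, assuming 2-D inputs of shape (batch, features) and margin=0.0.
import numpy as np

def cosine_embedding_loss_sketch(x1, x2, y, margin=0.0, eps=1e-12):
    cosine = (x1 * x2).sum(axis=1) / (
        np.sqrt((x1 ** 2).sum(axis=1) + eps) * np.sqrt((x2 ** 2).sum(axis=1) + eps))
    pos = 1.0 - cosine                      # y == 1: pull the pair together
    neg = np.maximum(cosine - margin, 0.0)  # y == -1: push the pair apart
    return np.where(y == 1, pos, np.where(y == -1, neg, 0.0))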
def construct(self, logits, label, input_mask):
    logits = F.cast(logits, mstype.float32)
    _, logit_max = self.max(logits)
    logit_sub = self.sub(logits, logit_max)
    logit_exp = self.exp(logit_sub)
    exp_sum = self.sum(logit_exp, -1)
    exp_sum = P.Reshape()(exp_sum, (F.shape(exp_sum)[0], 1))
    softmax_result = self.div(logit_exp, exp_sum)
    log_softmax_result = self.log(self.add(softmax_result, self.eps_const))

    label = P.Reshape()(label, (-1,))
    one_hot_label = self.onehot(label, self.vocab_size, self.on_value, self.off_value)
    loss = self.mul(log_softmax_result, one_hot_label)
    loss_unsum = self.neg(loss)
    loss_reduce = self.sum(loss_unsum, -1)

    input_mask = P.Reshape()(input_mask, (-1,))
    numerator = self.sum2(self.mul2(loss_reduce, input_mask))
    denominator = self.add2(self.sum2(input_mask),
                            P.Cast()(F.tuple_to_array((1e-5,)), mstype.float32))
    loss = self.div2(numerator, denominator)
    return loss
def __init__(self, tot_atoms):
    super().__init__()

    # tot_atoms: A
    # tot_neigh: N = A - 1
    tot_neigh = tot_atoms - 1
    arange = nn.Range(tot_atoms)
    nrange = nn.Range(tot_neigh)

    self.ones = P.Ones()
    self.aones = self.ones((tot_atoms), ms.int32)
    self.nones = self.ones((tot_neigh), ms.int32)
    self.eaones = F.expand_dims(self.aones, -1)

    # neighbors for no connection (A*N)
    # [[0, 0, ..., 0],
    #  [1, 1, ..., 1],
    #  ...............
    #  [N, N, ..., N]]
    self.nnc = F.expand_dims(arange(), -1) * self.nones

    # copy of the index range (A*N)
    # [[0, 1, ..., N-1],
    #  [0, 1, ..., N-1],
    #  .................
    #  [0, 1, ..., N-1]]
    exrange = self.ones((tot_atoms, 1), ms.int32) * nrange()

    # neighbors for full connection (A*N)
    # [[1, 2, 3, ..., N],
    #  [0, 2, 3, ..., N],
    #  [0, 1, 3, ..., N],
    #  ..................
    #  [0, 1, 2, ..., N-1]]
    self.nfc = exrange + F.cast(self.nnc <= exrange, ms.int32)

    self.ar0 = nn.Range(0, tot_neigh)()
    self.ar1 = nn.Range(1, tot_atoms)()
def _tensors_allreduce_with_sparse(degree, mean, allgather, allreduce, allreduce_filter, grad):
    """
    Apply allgather on gradient instead of allreduce for sparse feature.
    Allgather is a communication operation used for distributed deep learning.

    Args:
        degree (int): The mean coefficient.
        mean (bool): When mean is true, the mean coefficient (degree) would apply on gradients.
        allgather (Primitive): The communication operator for sparse gradients.
        allreduce (Primitive): The communication operator for gradients.
        allreduce_filter (bool): When it is true, allgather would apply.
        grad (RowTensor): The sparse gradient (indices, values, dense_shape) before operation.

    Returns:
        RowTensor, the gradient after operation.
    """
    if allreduce_filter:
        indices = allgather(grad.indices)
        dout = allgather(grad.values)
        if mean:
            dout = F.tensor_mul(dout, F.cast(degree, F.dtype(dout)))
        grad = RowTensor(indices, dout, grad.dense_shape)
    return grad