def construct(self, gradients, overflow):
    """AdamWeightDecayForBert"""
    lr = self.get_lr()
    cond = self.op_cast(F.fill(mstype.int32, self.op_shape(self.beta1), 1) *
                        self.op_reshape(overflow, (())), mstype.bool_)
    beta1 = self.op_select(cond, self.op_cast(F.tuple_to_array((1.0,)), mstype.float32), self.beta1)
    beta2 = self.op_select(cond, self.op_cast(F.tuple_to_array((1.0,)), mstype.float32), self.beta2)
    if self.is_group:
        if self.is_group_lr:
            optim_result = self.hyper_map(F.partial(_adam_opt, self.beta1, self.beta2, self.eps),
                                          lr, self.weight_decay, self.parameters,
                                          self.moments1, self.moments2, gradients,
                                          self.decay_flags, self.optim_filter)
        else:
            optim_result = self.hyper_map(F.partial(_adam_opt, beta1, beta2, self.eps, lr, overflow),
                                          self.weight_decay, self.parameters,
                                          self.moments1, self.moments2, gradients,
                                          self.decay_flags, self.optim_filter)
    else:
        optim_result = self.hyper_map(F.partial(_adam_opt, self.beta1, self.beta2, self.eps,
                                                lr, self.weight_decay),
                                      self.parameters, self.moments1, self.moments2,
                                      gradients, self.decay_flags, self.optim_filter)
    if self.use_parallel:
        self.broadcast_params(optim_result)
    return optim_result
def _update_run_op_for_map(beta1, beta2, eps, lr, weight_decay_tensor, param, m, v,
                           gradient, decay_flag):
    """Apply the AdamWeightDecay update to a single parameter."""
    op_mul = P.Mul()
    op_square = P.Square()
    op_sqrt = P.Sqrt()
    op_cast = P.Cast()
    op_reshape = P.Reshape()
    op_shape = P.Shape()

    param_fp32 = op_cast(param, mstype.float32)
    m_fp32 = op_cast(m, mstype.float32)
    v_fp32 = op_cast(v, mstype.float32)
    gradient_fp32 = op_cast(gradient, mstype.float32)

    next_m = op_mul(beta1, m_fp32) + \
             op_mul(op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta1, gradient_fp32)
    next_v = op_mul(beta2, v_fp32) + \
             op_mul(op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta2,
                    op_square(gradient_fp32))

    update = next_m / (op_sqrt(next_v) + eps)
    if decay_flag:
        update = update + op_mul(weight_decay_tensor, param_fp32)

    update_with_lr = op_mul(lr, update)
    next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32))

    next_v = F.depend(next_v, F.assign(param, next_param))
    next_v = F.depend(next_v, F.assign(m, next_m))
    next_v = F.depend(next_v, F.assign(v, next_v))
    return next_v
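# ---------------------------------------------------------------------------
# Illustrative sketch (not from the sources above): a plain-NumPy reference of
# the AdamWeightDecay step the fused ops implement, useful for checking the
# math. The function name and signature are hypothetical.
# ---------------------------------------------------------------------------
import numpy as np

def adamw_step_ref(param, m, v, grad, lr, beta1, beta2, eps, weight_decay, decay_flag=True):
    """One decoupled-weight-decay Adam step, mirroring _update_run_op_for_map."""
    m = beta1 * m + (1.0 - beta1) * grad                 # first-moment EMA
    v = beta2 * v + (1.0 - beta2) * np.square(grad)      # second-moment EMA
    update = m / (np.sqrt(v) + eps)
    if decay_flag:
        update = update + weight_decay * param           # decoupled weight decay
    return param - lr * update, m, v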
def construct(self, grads, clip_type, clip_value):
    """
    Construct gradient clip network.

    Args:
        grads (list): List of gradient tuples.
        clip_type (Tensor): The way to clip, 'value' or 'norm'.
        clip_value (Tensor): Specifies how much to clip.

    Returns:
        List, a list of clipped_grad tuples.
    """
    if clip_type != 0 and clip_type != 1:  # pylint: disable=R1714
        return grads

    new_grads = ()
    for grad in grads:
        dt = self.dtype(grad)
        if clip_type == 0:
            t = C.clip_by_value(grad,
                                self.cast(F.tuple_to_array((-clip_value,)), dt),
                                self.cast(F.tuple_to_array((clip_value,)), dt))
        else:
            t = self.clip_by_norm(grad, self.cast(F.tuple_to_array((clip_value,)), dt))
        new_grads = new_grads + (t,)
    return new_grads
def construct(self, x):
    num_batch = P.Shape()(x)[0]
    grid_size = P.Shape()(x)[2:4]

    # Reshape and transpose the feature to [n, grid_size[0], grid_size[1], 3, num_attrib]
    prediction = P.Reshape()(x, (num_batch,
                                 self.num_anchors_per_scale,
                                 self.num_attrib,
                                 grid_size[0],
                                 grid_size[1]))
    prediction = P.Transpose()(prediction, (0, 3, 4, 1, 2))

    range_x = range(grid_size[1])
    range_y = range(grid_size[0])
    grid_x = P.Cast()(F.tuple_to_array(range_x), ms.float32)
    grid_y = P.Cast()(F.tuple_to_array(range_y), ms.float32)
    # Tensors of shape [1, grid_size[0], grid_size[1], 1, 1] holding the x/y coordinate
    # of each grid cell
    grid_x = self.tile(self.reshape(grid_x, (1, 1, -1, 1, 1)), (1, grid_size[0], 1, 1, 1))
    grid_y = self.tile(self.reshape(grid_y, (1, -1, 1, 1, 1)), (1, 1, grid_size[1], 1, 1))
    # Shape is [1, grid_size[0], grid_size[1], 1, 2]
    grid = self.concat((grid_x, grid_y))

    box_xy = prediction[:, :, :, :, :2]
    box_wh = prediction[:, :, :, :, 2:4]
    box_confidence = prediction[:, :, :, :, 4:5]
    box_probs = prediction[:, :, :, :, 5:]

    box_xy = (self.sigmoid(box_xy) + grid) / \
             P.Cast()(F.tuple_to_array((grid_size[1], grid_size[0])), ms.float32)
    box_wh = P.Exp()(box_wh) * self.anchors / self.input_shape
    box_confidence = self.sigmoid(box_confidence)
    box_probs = self.sigmoid(box_probs)

    if self.training:
        return grid, prediction, box_xy, box_wh
    return box_xy, box_wh, box_confidence, box_probs
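# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical helper, not model code): how the xy
# decoding above works in plain NumPy. sigmoid(tx, ty) is the offset inside a
# cell; adding the cell coordinate and dividing by the grid size normalizes
# the box center into [0, 1].
# ---------------------------------------------------------------------------
import numpy as np

def decode_box_xy_ref(txty, grid_xy, grid_w, grid_h):
    """txty: (..., 2) raw offsets; grid_xy: (..., 2) integer cell coordinates."""
    sig = 1.0 / (1.0 + np.exp(-txty))
    return (sig + grid_xy) / np.array([grid_w, grid_h], dtype=np.float32)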
def _update_run_op(beta1, beta2, eps, lr, weight_decay, param, m, v, gradient, decay_flag,
                   optim_filter):
    """
    Update parameters.

    Args:
        beta1 (Tensor): The exponential decay rate for the 1st moment estimations.
            Should be in range (0.0, 1.0).
        beta2 (Tensor): The exponential decay rate for the 2nd moment estimations.
            Should be in range (0.0, 1.0).
        eps (Tensor): Term added to the denominator to improve numerical stability.
            Should be greater than 0.
        lr (Tensor): Learning rate.
        weight_decay (Number): Weight decay. Should be equal to or greater than 0.
        param (Tensor): Parameters.
        m (Tensor): m value of parameters.
        v (Tensor): v value of parameters.
        gradient (Tensor): Gradient of parameters.
        decay_flag (bool): Applies weight decay or not.
        optim_filter (bool): Applies parameter update or not.

    Returns:
        Tensor, the updated parameter if optim_filter is True, otherwise the original gradient.
    """
    if optim_filter:
        op_mul = P.Mul()
        op_square = P.Square()
        op_sqrt = P.Sqrt()
        op_cast = P.Cast()
        op_reshape = P.Reshape()
        op_shape = P.Shape()

        param_fp32 = op_cast(param, mstype.float32)
        m_fp32 = op_cast(m, mstype.float32)
        v_fp32 = op_cast(v, mstype.float32)
        gradient_fp32 = op_cast(gradient, mstype.float32)

        next_m = op_mul(beta1, m_fp32) + \
                 op_mul(op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta1, gradient_fp32)
        next_v = op_mul(beta2, v_fp32) + \
                 op_mul(op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta2,
                        op_square(gradient_fp32))

        update = next_m / (eps + op_sqrt(next_v))
        if decay_flag:
            update = op_mul(weight_decay, param_fp32) + update

        update_with_lr = op_mul(lr, update)
        next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32))

        next_param = F.depend(next_param, F.assign(param, op_cast(next_param, F.dtype(param))))
        next_param = F.depend(next_param, F.assign(m, op_cast(next_m, F.dtype(m))))
        next_param = F.depend(next_param, F.assign(v, op_cast(next_v, F.dtype(v))))
        return op_cast(next_param, F.dtype(param))
    return gradient
def _run_opt_with_sparse(opt, sparse_opt, push, pull, use_locking, use_nesterov, target,
                         beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, param,
                         m, v, ps_parameter, cache_enable):
    """Apply sparse adam optimizer to the weight parameter when the gradient is sparse."""
    success = True
    indices = gradient.indices
    values = gradient.values
    if ps_parameter and not cache_enable:
        op_shape = P.Shape()
        shapes = (op_shape(param), op_shape(m), op_shape(v),
                  op_shape(beta1_power), op_shape(beta2_power), op_shape(lr),
                  op_shape(beta1), op_shape(beta2), op_shape(eps),
                  op_shape(values), op_shape(indices))
        success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2,
                                               eps, values, indices), shapes), param))
        return success

    if not target:
        success = F.depend(success, sparse_opt(param, m, v, beta1_power, beta2_power, lr,
                                               beta1, beta2, eps, values, indices))
    else:
        op_mul = P.Mul()
        op_square = P.Square()
        op_sqrt = P.Sqrt()
        scatter_add = P.ScatterAdd(use_locking)

        success = F.depend(success, F.assign(m, op_mul(beta1, m)))
        success = F.depend(success, F.assign(v, op_mul(beta2, v)))

        grad_indices = gradient.indices
        grad_value = gradient.values

        next_m = scatter_add(m, grad_indices,
                             op_mul(F.tuple_to_array((1.0,)) - beta1, grad_value))
        next_v = scatter_add(v, grad_indices,
                             op_mul(F.tuple_to_array((1.0,)) - beta2, op_square(grad_value)))

        if use_nesterov:
            m_temp = next_m * _scaler_ten
            F.assign(m, op_mul(beta1, next_m))
            div_value = scatter_add(m, op_mul(grad_indices, _scaler_one),
                                    op_mul(F.tuple_to_array((1.0,)) - beta1, grad_value))
            param_update = div_value / (op_sqrt(next_v) + eps)
            F.assign(m, m_temp / _scaler_ten)
        else:
            param_update = next_m / (op_sqrt(next_v) + eps)

        lr_t = lr * op_sqrt(1 - beta2_power) / (1 - beta1_power)
        next_param = param - lr_t * param_update

        success = F.depend(success, F.assign(param, next_param))
        success = F.depend(success, F.assign(m, next_m))
        success = F.depend(success, F.assign(v, next_v))
    return success
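# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical, NumPy): the sparse moment update done by
# the ScatterAdd path above. All rows are decayed, then only the rows touched
# by the sparse gradient receive the (1 - beta) * value contribution.
# ---------------------------------------------------------------------------
import numpy as np

def sparse_adam_moments_ref(m, v, indices, values, beta1, beta2):
    m = beta1 * m                                             # F.assign(m, beta1 * m)
    v = beta2 * v                                             # F.assign(v, beta2 * v)
    np.add.at(m, indices, (1.0 - beta1) * values)             # ScatterAdd on m
    np.add.at(v, indices, (1.0 - beta2) * np.square(values))  # ScatterAdd on v
    return m, v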
def construct(self, grads, clip_min, clip_max):
    new_grads = ()
    for grad in grads:
        dt = self.dtype(grad)
        t = C.clip_by_value(grad,
                            self.cast(F.tuple_to_array((clip_min,)), dt),
                            self.cast(F.tuple_to_array((clip_max,)), dt))
        t = self.cast(t, dt)
        new_grads = new_grads + (t,)
    return new_grads
def _clip_grad(clip_type, clip_value, grad):
    """Clip one gradient tensor by value (clip_type == 0) or by norm (otherwise)."""
    dt = F.dtype(grad)
    if clip_type == 0:
        new_grad = C.clip_by_value(grad,
                                   F.cast(F.tuple_to_array((-clip_value,)), dt),
                                   F.cast(F.tuple_to_array((clip_value,)), dt))
    else:
        new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
    return new_grad
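# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical, NumPy): the two clip modes side by side.
# clip_type == 0 clamps each element into [-c, c]; the norm mode rescales the
# whole tensor when its L2 norm exceeds c, preserving the gradient direction.
# ---------------------------------------------------------------------------
import numpy as np

def clip_grad_ref(grad, clip_type, clip_value):
    if clip_type == 0:
        return np.clip(grad, -clip_value, clip_value)
    norm = np.linalg.norm(grad)
    return grad if norm <= clip_value else grad * (clip_value / norm)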
def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, param, m, v, gradient, decay_flag):
    """
    Update parameters.

    Args:
        beta1 (Tensor): The exponential decay rate for the 1st moment estimates.
            Should be in range (0.0, 1.0).
        beta2 (Tensor): The exponential decay rate for the 2nd moment estimates.
            Should be in range (0.0, 1.0).
        eps (Tensor): Term added to the denominator to improve numerical stability.
            Should be greater than 0.
        lr (Tensor): Learning rate.
        weight_decay_tensor (Tensor): Weight decay. Should be equal to or greater than 0.
        param (Tensor): Parameters.
        m (Tensor): m value of parameters.
        v (Tensor): v value of parameters.
        gradient (Tensor): Gradient of parameters.
        decay_flag (bool): Applies weight decay or not.

    Returns:
        Tensor, the new value of v after updating.
    """
    op_mul = P.Mul()
    op_square = P.Square()
    op_sqrt = P.Sqrt()
    op_cast = P.Cast()
    op_reshape = P.Reshape()
    op_shape = P.Shape()

    # Work on fp32 copies so param/m/v stay valid assign targets below.
    param_fp32 = op_cast(param, mstype.float32)
    m_fp32 = op_cast(m, mstype.float32)
    v_fp32 = op_cast(v, mstype.float32)
    gradient_fp32 = op_cast(gradient, mstype.float32)

    next_m = op_mul(beta1, m_fp32) + \
             op_mul(op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta1, gradient_fp32)
    next_v = op_mul(beta2, v_fp32) + \
             op_mul(op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta2,
                    op_square(gradient_fp32))

    update = next_m / (op_sqrt(next_v) + eps)
    if decay_flag:
        update = update + op_mul(weight_decay_tensor, param_fp32)

    update_with_lr = op_mul(lr, update)
    next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32))

    next_v = F.depend(next_v, F.assign(param, next_param))
    next_v = F.depend(next_v, F.assign(m, next_m))
    next_v = F.depend(next_v, F.assign(v, next_v))
    return next_v
def construct(self,
              source_eos_ids,
              source_eos_mask,
              target_sos_ids,
              target_sos_mask,
              target_eos_ids,
              target_eos_mask):
    """Defines the computation performed."""
    source_ids = source_eos_ids
    source_mask = source_eos_mask
    target_ids = target_sos_ids
    target_mask = target_sos_mask
    label_ids = target_eos_ids
    label_weights = target_eos_mask

    weights = self.weights
    loss = self.network(source_ids, source_mask, target_ids, target_mask,
                        label_ids, label_weights)
    grads = self.grad(self.network, weights)(source_ids, source_mask, target_ids,
                                             target_mask, label_ids, label_weights,
                                             self.cast(F.tuple_to_array((self.sens,)),
                                                       mstype.float32))
    grads = self.clip_gradients(grads, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE)
    if self.reducer_flag:
        # apply grad reducer on grads
        grads = self.grad_reducer(grads)
    succ = self.optimizer(grads)
    return F.depend(loss, succ)
def _attn(self, query, key, value, attention_mask):
    """
    Get the weighted score along the seq_length.

    Inputs:
        query: the query matrix
        key: the key matrix
        value: the value matrix
        attention_mask: the attention mask matrix with shape
            (batch_size, 1, seq_length, seq_length)

    Returns:
        weighted_values: Tensor, the weighted sum scores
    """
    if not self.scale:
        query = query / F.cast(self.coeff, F.dtype(query))
        key = key / F.cast(self.coeff, F.dtype(key))

    score = self.batch_matmul(query, key)
    if self.scale:
        score = score / P.Cast()(self.scale_factor, P.DType()(score))

    ori_dtype = P.DType()(score)
    score = P.Cast()(score, mstype.float32)
    multiply_out = P.Sub()(P.Cast()(F.tuple_to_array((1.0,)), P.DType()(score)),
                           P.Cast()(attention_mask, P.DType()(score)))
    adder = P.Mul()(multiply_out, self.multiply_data)
    attention_scores = adder + score
    attention_scores = P.Cast()(attention_scores, ori_dtype)

    attention_probs = Softmax()(attention_scores)
    attention_probs = self.prob_dropout(attention_probs)

    weighted_values = self.batch_matmul(attention_probs, value)
    return weighted_values
def construct(self, input_ids, input_mask, input_position=None, attention_mask=None,
              layer_past=None):
    """PanGu Alpha model"""
    if not self.use_past:
        layer_past = self.past

    input_embedding, embedding_table = self.word_embedding(input_ids)
    if not self.eod_reset:
        batch_size, seq_length = F.shape(input_ids)
        input_position = F.tuple_to_array(F.make_range(seq_length))
        input_position = P.Tile()(input_position, (batch_size, 1))
        attention_mask = self.get_attention_mask(input_mask)
    position_embedding = self.position_embedding(input_position)

    hidden_states = self.add(input_embedding, position_embedding)
    hidden_states = self.dropout(hidden_states)
    hidden_states = P.Cast()(hidden_states, mstype.float16)
    attention_mask = self.expand_dims(attention_mask, 1)

    present_layer = ()
    for i in range(self.num_layers):
        hidden_states, present = self.blocks[i](hidden_states, attention_mask, layer_past)
        present_layer = present_layer + (present,)

    output_state = self.layernorm(hidden_states)
    output_state = F.cast(output_state, self.dtype)

    top_query_hidden_states = self.top_query_embedding(input_position)
    output_state, present = self.top_query_layer(output_state, top_query_hidden_states,
                                                 attention_mask, layer_past)
    present_layer = present_layer + (present,)
    return output_state, present_layer, embedding_table
def construct(self, input_ids, input_mask, layer_past=None):
    """GPT model"""
    if not self.use_past:
        layer_past = self.past

    input_embedding, embedding_table = self.word_embedding(input_ids)
    batch_size, seq_length = F.shape(input_ids)
    input_position = F.tuple_to_array(F.make_range(seq_length))
    input_position = P.Tile()(input_position, (batch_size, 1))
    position_embedding = self.position_embedding(input_position)

    hidden_states = input_embedding + position_embedding
    hidden_states = P.Cast()(hidden_states, mstype.float16)

    attention_mask = self.get_attention_mask(input_mask)
    attention_mask = P.ExpandDims()(attention_mask, 1)

    present_layer = ()
    for i in range(self.num_layers):
        hidden_states, present = self.blocks[i](hidden_states, attention_mask, layer_past)
        present_layer = present_layer + (present,)

    output_state = self.layernorm(hidden_states)
    return output_state, present_layer, embedding_table
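# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical, NumPy): the make_range + Tile pattern used
# above to build position ids is just a broadcast arange over the batch.
# ---------------------------------------------------------------------------
import numpy as np

def position_ids_ref(batch_size, seq_length):
    return np.tile(np.arange(seq_length)[None, :], (batch_size, 1))  # (batch, seq)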
def construct(self, input_ids, input_mask, token_type_id, next_sentence_labels,
              masked_lm_positions, masked_lm_ids, masked_lm_weights):
    """Defines the computation performed."""
    weights = self.weights
    loss = self.network(input_ids, input_mask, token_type_id, next_sentence_labels,
                        masked_lm_positions, masked_lm_ids, masked_lm_weights)
    grads = self.grad(self.network, weights)(input_ids, input_mask, token_type_id,
                                             next_sentence_labels, masked_lm_positions,
                                             masked_lm_ids, masked_lm_weights,
                                             self.cast(F.tuple_to_array((self.sens,)),
                                                       mstype.float32))
    grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
    grads = self.grad_reducer(grads)
    succ = self.optimizer(grads)
    return F.depend(loss, succ)
def construct(self, input_ids, input_mask, token_type_id, label_ids):
    """Defines the computation performed."""
    weights = self.weights
    # Save the full-precision weights, then swap in quantized embeddings/weights
    # for the forward and backward pass.
    for i in range(self.length):
        F.assign(self.saved_params[i], weights[i])

    for i in range(self.quant_embedding_list_length):
        quant_embedding = self.quantize_embedding(weights[self.quant_embedding_list[i]])
        F.assign(weights[self.quant_embedding_list[i]], quant_embedding)

    for i in range(self.quant_weight_list_length):
        quant_weight = self.quantize_weight(weights[self.quant_weight_list[i]])
        F.assign(weights[self.quant_weight_list[i]], quant_weight)

    grads = self.grad(self.network, weights)(input_ids, input_mask, token_type_id, label_ids,
                                             self.cast(F.tuple_to_array((self.sens,)),
                                                       mstype.float32))
    # apply grad reducer on grads
    grads = self.grad_reducer(grads)
    grads = self.hyper_map(F.partial(clip_grad, self.clip_type, self.clip_value), grads)

    # Restore the saved full-precision weights before the optimizer step.
    for i in range(self.length):
        param = F.depend(self.saved_params[i], grads)
        F.assign(weights[i], param)

    succ = self.optimizer(grads)
    return succ
def construct(self, grads, clip_type, clip_value):
    """clip gradients"""
    if clip_type not in (0, 1):
        return grads

    new_grads = ()
    for grad in grads:
        dt = self.dtype(grad)
        if clip_type == 0:
            t = C.clip_by_value(grad,
                                self.cast(F.tuple_to_array((-clip_value,)), dt),
                                self.cast(F.tuple_to_array((clip_value,)), dt))
        else:
            t = self.clip_by_norm(grad, self.cast(F.tuple_to_array((clip_value,)), dt))
        new_grads = new_grads + (t,)
    return new_grads
def construct(self, prediction_scores, label_ids, label_weights):
    """
    Construct network to calculate loss.

    Args:
        prediction_scores (Tensor): Prediction scores.
        label_ids (Tensor): Labels.
        label_weights (Tensor): Mask tensor.

    Returns:
        Tensor, final loss.
    """
    label_shape = self.get_shape(label_ids)
    label_ids = self.reshape(label_ids, (label_shape[0] * label_shape[1],))
    label_weights = self.cast(
        self.reshape(label_weights, (label_shape[0] * label_shape[1],)),
        mstype.float32)
    one_hot_labels = self.onehot(label_ids, self.vocab_size, self.on_value, self.off_value)

    per_example_loss = self.neg(self.reduce_sum(prediction_scores * one_hot_labels,
                                                self.last_idx))
    numerator = self.reduce_sum(label_weights * per_example_loss, ())
    denominator = self.reduce_sum(label_weights, ()) + \
                  self.cast(F.tuple_to_array((1e-5,)), mstype.float32)
    loss = numerator / denominator
    return loss
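# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical, NumPy): the weighted mean computed above.
# The 1e-5 keeps the denominator non-zero in the degenerate case where every
# label weight is 0, i.e. no position in the batch is masked for prediction.
# ---------------------------------------------------------------------------
import numpy as np

def masked_mean_loss_ref(per_example_loss, label_weights):
    numerator = np.sum(label_weights * per_example_loss)
    denominator = np.sum(label_weights) + 1e-5
    return numerator / denominator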
def construct(self, prediction_scores, seq_relationship_score, masked_lm_ids,
              masked_lm_weights, next_sentence_labels):
    """Defines the computation performed."""
    # masked_lm_loss
    label_ids = self.reshape(masked_lm_ids, self.last_idx)
    label_weights = self.cast(self.reshape(masked_lm_weights, self.last_idx), mstype.float32)
    one_hot_labels = self.onehot(label_ids, self.vocab_size, self.on_value, self.off_value)

    per_example_loss = self.neg(self.reduce_sum(prediction_scores * one_hot_labels,
                                                self.last_idx))
    numerator = self.reduce_sum(label_weights * per_example_loss, ())
    denominator = self.reduce_sum(label_weights, ()) + \
                  self.cast(F.tuple_to_array((1e-5,)), mstype.float32)
    masked_lm_loss = numerator / denominator

    # next_sentence_loss
    labels = self.reshape(next_sentence_labels, self.last_idx)
    one_hot_labels = self.onehot(labels, 2, self.on_value, self.off_value)
    per_example_loss = self.neg(self.reduce_sum(one_hot_labels * seq_relationship_score,
                                                self.last_idx))
    next_sentence_loss = self.reduce_mean(per_example_loss, self.last_idx)

    # total_loss
    total_loss = masked_lm_loss + next_sentence_loss
    return total_loss
def construct(self, prediction, pred_xy, pred_wh, y_true, gt_box, input_shape):
    """
    prediction : origin output from yolo
    pred_xy: (sigmoid(xy)+grid)/grid_size
    pred_wh: (exp(wh)*anchors)/input_shape
    y_true : after normalize
    gt_box: [batch, maxboxes, xyhw] after normalize
    """
    object_mask = y_true[:, :, :, :, 4:5]
    class_probs = y_true[:, :, :, :, 5:]
    true_boxes = y_true[:, :, :, :, :4]

    grid_shape = P.Shape()(prediction)[1:3]
    grid_shape = P.Cast()(F.tuple_to_array(grid_shape[::-1]), ms.float32)

    pred_boxes = self.concat((pred_xy, pred_wh))
    true_wh = y_true[:, :, :, :, 2:4]
    true_wh = P.Select()(P.Equal()(true_wh, 0.0),
                         P.Fill()(P.DType()(true_wh), P.Shape()(true_wh), 1.0),
                         true_wh)
    true_wh = P.Log()(true_wh / self.anchors * input_shape)
    # 2 - w * h gives small boxes a larger loss scale, since small objects need
    # more precise boxes
    box_loss_scale = 2 - y_true[:, :, :, :, 2:3] * y_true[:, :, :, :, 3:4]

    gt_shape = P.Shape()(gt_box)
    # add one more dimension for broadcast
    gt_box = P.Reshape()(gt_box, (gt_shape[0], 1, 1, 1, gt_shape[1], gt_shape[2]))

    # gt_box is x, y, h, w after normalize
    iou = self.iou(P.ExpandDims()(pred_boxes, -2), gt_box)
    # iou: [batch, grid[0], grid[1], num_anchor, num_gt]
    best_iou = self.reduce_max(iou, -1)
    # best_iou: [batch, grid[0], grid[1], num_anchor]

    # ignore_mask: ignore anchors whose best IoU with any ground truth is too small
    ignore_mask = best_iou < self.ignore_threshold
    ignore_mask = P.Cast()(ignore_mask, ms.float32)
    ignore_mask = P.ExpandDims()(ignore_mask, -1)
    # Backprop through ignore_mask would generate many MaximumGrad and MinimumGrad
    # ops and cost a lot of time, so we cut off its gradient.
    ignore_mask = F.stop_gradient(ignore_mask)

    confidence_loss = self.confidence_loss(object_mask, prediction[:, :, :, :, 4:5], ignore_mask)
    class_loss = self.class_loss(object_mask, prediction[:, :, :, :, 5:], class_probs)

    object_mask_me = P.Reshape()(object_mask, (-1, 1))  # [8, 72, 72, 3, 1]
    box_loss_scale_me = P.Reshape()(box_loss_scale, (-1, 1))

    pred_boxes_me = xywh2x1y1x2y2(pred_boxes)
    pred_boxes_me = P.Reshape()(pred_boxes_me, (-1, 4))
    true_boxes_me = xywh2x1y1x2y2(true_boxes)
    true_boxes_me = P.Reshape()(true_boxes_me, (-1, 4))

    ciou = self.giou(pred_boxes_me, true_boxes_me)
    ciou_loss = object_mask_me * box_loss_scale_me * (1 - ciou)
    ciou_loss_me = self.reduce_sum(ciou_loss, ())

    loss = ciou_loss_me * 10 + confidence_loss + class_loss
    batch_size = P.Shape()(prediction)[0]
    return loss / batch_size
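# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical, NumPy): the ignore-mask rule used above.
# Anchors whose best IoU against every ground-truth box is below the threshold
# are treated as background in the confidence loss.
# ---------------------------------------------------------------------------
import numpy as np

def ignore_mask_ref(iou, threshold):
    """iou: (batch, gh, gw, anchors, num_gt) -> (batch, gh, gw, anchors, 1) float mask."""
    best_iou = iou.max(axis=-1)
    return (best_iou < threshold).astype(np.float32)[..., None]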
def construct(self, grads, clip_type, clip_value):
    """construct a compute flow."""
    # pylint: disable=consider-using-in
    if clip_type != 0 and clip_type != 1:
        return grads

    new_grads = ()
    for grad in grads:
        if clip_type == 0:
            t = C.clip_by_value(grad,
                                F.tuple_to_array((-clip_value,)),
                                F.tuple_to_array((clip_value,)))
        else:
            t = self.clip_by_norm(grad, F.tuple_to_array((clip_value,)))
        new_grads = new_grads + (t,)
    return new_grads
def construct(self, grads, clip_type, clip_value):
    """clip gradients"""
    if clip_type != 0 and clip_type != 1:
        return grads

    new_grads = ()
    for grad in grads:
        dt = self.dtype(grad)
        if clip_type == 0:
            t = C.clip_by_value(grad,
                                self.cast(F.tuple_to_array((-clip_value,)), dt),
                                self.cast(F.tuple_to_array((clip_value,)), dt))
        else:
            t = self.clip_by_norm(grad, self.cast(F.tuple_to_array((clip_value,)), dt))
        new_grads = new_grads + (t,)
    return new_grads
def construct(self, logits, label, input_mask):
    logits = self.log_softmax(P.Cast()(logits, mstype.float32))
    label = P.Reshape()(label, (-1,))
    one_hot_label = self.onehot(label, self.vocab_size, self.on_value, self.off_value)
    loss_sum = P.Neg()(self.sum(logits * one_hot_label, (-1,)))

    input_mask = P.Reshape()(input_mask, (-1,))
    numerator = self.sum(loss_sum * input_mask)
    denominator = self.sum(input_mask) + \
                  P.Cast()(F.tuple_to_array((1e-5,)), mstype.float32)
    loss = numerator / denominator
    return loss
def construct(self, input_tensor):
    attention_scores = input_tensor
    attention_scores = self.cast(attention_scores, mstype.float32)
    if self.has_attention_mask:
        attention_mask = self.expand_dims(self.attention_mask, 1)
        multiply_out = self.sub(self.cast(F.tuple_to_array((1.0,)), mstype.float32),
                                self.cast(attention_mask, self.get_dtype(attention_scores)))
        adder = self.multiply(multiply_out, self.multiply_data)
        attention_scores = self.add(adder, attention_scores)
    return attention_scores
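# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical, NumPy): the additive mask trick above.
# Positions with mask == 0 receive a large negative bias (multiply_data is
# typically -10000.0), so softmax pushes their attention weight toward zero.
# ---------------------------------------------------------------------------
import numpy as np

def add_attention_bias_ref(scores, attention_mask, neg_bias=-10000.0):
    return scores + (1.0 - attention_mask) * neg_bias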
def construct(self, x, input_shape):
    """construct method"""
    num_batch = P.Shape()(x)[0]
    grid_size = P.Shape()(x)[2:4]

    # Reshape and transpose the feature to [n, grid_size[0], grid_size[1], 3, num_attrib]
    prediction = P.Reshape()(x, (num_batch,
                                 self.num_anchors_per_scale,
                                 self.num_attrib,
                                 grid_size[0],
                                 grid_size[1]))
    prediction = P.Transpose()(prediction, (0, 3, 4, 1, 2))

    range_x = range(grid_size[1])
    range_y = range(grid_size[0])
    grid_x = P.Cast()(F.tuple_to_array(range_x), ms.float32)
    grid_y = P.Cast()(F.tuple_to_array(range_y), ms.float32)
    # Tensors of shape [1, grid_size[0], grid_size[1], 1, 1] holding the x/y coordinate
    # of each grid cell
    grid_x = self.tile(self.reshape(grid_x, (1, 1, -1, 1, 1)), (1, grid_size[0], 1, 1, 1))
    grid_y = self.tile(self.reshape(grid_y, (1, -1, 1, 1, 1)), (1, 1, grid_size[1], 1, 1))
    # Shape is [1, grid_size[0], grid_size[1], 1, 2]
    grid = self.concat((grid_x, grid_y))

    box_xy = prediction[:, :, :, :, :2]
    box_wh = prediction[:, :, :, :, 2:4]
    box_confidence = prediction[:, :, :, :, 4:5]
    box_probs = prediction[:, :, :, :, 5:]

    # grid_size[1] is the x dimension, grid_size[0] is the y dimension
    box_xy = (self.scale_x_y * self.sigmoid(box_xy) - self.offset_x_y + grid) / \
             P.Cast()(F.tuple_to_array((grid_size[1], grid_size[0])), ms.float32)
    # box_wh is w->h
    box_wh = P.Exp()(box_wh) * self.anchors / input_shape

    box_confidence = self.sigmoid(box_confidence)
    box_probs = self.sigmoid(box_probs)

    if self.conf_training:
        return prediction, box_xy, box_wh
    return self.concat((box_xy, box_wh, box_confidence, box_probs))
def construct(self, beta1, beta2, gradient, eps, weight_decay_tensor, lr):
    param_fp32 = self.op_cast(self.param, mstype.float32)
    m_fp32 = self.op_cast(self.m, mstype.float32)
    v_fp32 = self.op_cast(self.v, mstype.float32)
    gradient_fp32 = self.op_cast(gradient, mstype.float32)

    next_m = self.op_mul(beta1, m_fp32) + \
             self.op_mul(self.op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta1,
                         gradient_fp32)
    next_v = self.op_mul(beta2, v_fp32) + \
             self.op_mul(self.op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta2,
                         self.op_square(gradient_fp32))

    update = next_m / (eps + self.op_sqrt(next_v))
    if self.decay_flag:
        update = self.op_mul(weight_decay_tensor, param_fp32) + update

    update_with_lr = self.op_mul(lr, update)
    next_param = param_fp32 - self.op_reshape(update_with_lr, self.op_shape(param_fp32))

    next_v = F.depend(next_v, F.assign(self.param, next_param))
    next_v = F.depend(next_v, F.assign(self.m, next_m))
    next_v = F.depend(next_v, F.assign(self.v, next_v))
    return next_v
def construct(self, prediction_scores, label_ids, label_weights):
    """Defines the computation performed."""
    label_ids = self.reshape(label_ids, self.flat_shape)
    label_weights = self.cast(self.reshape(label_weights, self.flat_shape), mstype.float32)
    one_hot_labels = self.onehot(label_ids, self.vocab_size, self.on_value, self.off_value)

    per_example_loss = self.neg(self.reduce_sum(prediction_scores * one_hot_labels,
                                                self.last_idx))
    numerator = self.reduce_sum(label_weights * per_example_loss, ())
    denominator = self.reduce_sum(label_weights, ()) + \
                  self.cast(F.tuple_to_array((1e-5,)), mstype.float32)
    loss = numerator / denominator
    return loss
def _clip_grad(clip_type, clip_value, grad):
    """
    Clip gradients.

    Inputs:
        clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.
        clip_value (float): Specifies how much to clip.
        grad (Tensor): Gradient to clip.

    Outputs:
        Tensor, the clipped gradient.
    """
    if clip_type not in (0, 1):
        return grad
    dt = F.dtype(grad)
    if clip_type == 0:
        new_grad = C.clip_by_value(grad,
                                   F.cast(F.tuple_to_array((-clip_value,)), dt),
                                   F.cast(F.tuple_to_array((clip_value,)), dt))
    else:
        new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
    return new_grad
def construct(self, grid, prediction, pred_xy, pred_wh, y_true, gt_box):
    object_mask = y_true[:, :, :, :, 4:5]
    class_probs = y_true[:, :, :, :, 5:]

    grid_shape = P.Shape()(prediction)[1:3]
    grid_shape = P.Cast()(F.tuple_to_array(grid_shape[::-1]), ms.float32)

    pred_boxes = self.concat((pred_xy, pred_wh))
    true_xy = y_true[:, :, :, :, :2] * grid_shape - grid
    true_wh = y_true[:, :, :, :, 2:4]
    true_wh = P.Select()(P.Equal()(true_wh, 0.0),
                         P.Fill()(P.DType()(true_wh), P.Shape()(true_wh), 1.0),
                         true_wh)
    true_wh = P.Log()(true_wh / self.anchors * self.input_shape)
    box_loss_scale = 2 - y_true[:, :, :, :, 2:3] * y_true[:, :, :, :, 3:4]

    gt_shape = P.Shape()(gt_box)
    gt_box = P.Reshape()(gt_box, (gt_shape[0], 1, 1, 1, gt_shape[1], gt_shape[2]))

    iou = self.iou(P.ExpandDims()(pred_boxes, -2), gt_box)
    # iou: [batch, grid[0], grid[1], num_anchor, num_gt]
    best_iou = self.reduce_max(iou, -1)
    # best_iou: [batch, grid[0], grid[1], num_anchor]
    ignore_mask = best_iou < self.ignore_threshold
    ignore_mask = P.Cast()(ignore_mask, ms.float32)
    ignore_mask = P.ExpandDims()(ignore_mask, -1)
    ignore_mask = F.stop_gradient(ignore_mask)

    xy_loss = object_mask * box_loss_scale * self.cross_entropy(prediction[:, :, :, :, :2],
                                                                true_xy)
    wh_loss = object_mask * box_loss_scale * 0.5 * P.Square()(true_wh -
                                                              prediction[:, :, :, :, 2:4])
    confidence_loss = self.cross_entropy(prediction[:, :, :, :, 4:5], object_mask)
    confidence_loss = object_mask * confidence_loss + \
                      (1 - object_mask) * confidence_loss * ignore_mask
    class_loss = object_mask * self.cross_entropy(prediction[:, :, :, :, 5:], class_probs)

    # Get smooth loss
    xy_loss = self.reduce_sum(xy_loss, ())
    wh_loss = self.reduce_sum(wh_loss, ())
    confidence_loss = self.reduce_sum(confidence_loss, ())
    class_loss = self.reduce_sum(class_loss, ())

    loss = xy_loss + wh_loss + confidence_loss + class_loss
    return loss / P.Shape()(prediction)[0]
def construct(self):
    """position matrix generator"""
    range_vec_row_out = self.cast(F.tuple_to_array(F.make_range(self._length)), mstype.int32)
    range_vec_col_out = self.range_mat(range_vec_row_out, (self._length, -1))
    tile_row_out = self.tile(range_vec_row_out, (self._length,))
    tile_col_out = self.tile(range_vec_col_out, (1, self._length))
    range_mat_out = self.range_mat(tile_row_out, (self._length, self._length))
    transpose_out = self.range_mat(tile_col_out, (self._length, self._length))
    distance_mat = self.sub(range_mat_out, transpose_out)

    distance_mat_clipped = C.clip_by_value(distance_mat,
                                           self._min_relative_position,
                                           self._max_relative_position)

    # Shift values to be >= 0. Each integer still uniquely identifies a
    # relative position difference.
    final_mat = distance_mat_clipped + self._max_relative_position
    return final_mat
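# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical, NumPy): the relative-position matrix
# generated above. Entry (i, j) is clip(j - i, min, max) + max, so every
# clipped distance maps to a unique non-negative bucket id.
# ---------------------------------------------------------------------------
import numpy as np

def relative_position_matrix_ref(length, min_rel, max_rel):
    r = np.arange(length)
    dist = r[None, :] - r[:, None]             # pairwise j - i
    return np.clip(dist, min_rel, max_rel) + max_rel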
def construct(self, logits, label_ids, input_mask=None):
    """
    Calculate loss.

    Args:
        logits (Tensor): the probability distribution over vocabulary.
        label_ids (Tensor): the indices of input sequence tokens in the vocabulary.
        input_mask (Tensor): input sentence padding mask, where 0 indicates a padding position.

    Returns:
        return_value (Tensor, mstype.float32): if is_training is False, directly return
            the logits; otherwise, return the computed loss.
    """
    # logits: [batch * (seq_length - 1), vocab_size]; label_ids: [batch, seq_length - 1]
    if self.is_training:
        label_ids = self.reshape(label_ids, self.last_idx)  # [batch * (seq_length - 1)]
        one_hot_labels = self.onehot(label_ids, self.num_labels, self.on_value,
                                     self.off_value)  # [batch * (seq_length - 1), vocab_size]
        per_example_loss = self.neg(
            self.reduce_sum(one_hot_labels * logits,
                            self.last_idx))  # [batch * (seq_length - 1)]

        # for PPL calculation in evaluation
        if input_mask is not None:
            input_mask = self.cast(self.reshape(input_mask, self.last_idx),
                                   mstype.float32)  # [batch * (seq_length - 1)]
            valid_loss_sum = self.reduce_sum(input_mask * per_example_loss, ())
            valid_element_sum = self.reduce_sum(input_mask, ()) + \
                                self.cast(F.tuple_to_array((1e-5,)), mstype.float32)
            loss = valid_loss_sum / valid_element_sum
        else:
            loss = self.reduce_mean(per_example_loss, self.last_idx)  # a scalar
        return_value = self.cast(loss, mstype.float32)
    else:
        return_value = logits * 1.0  # [batch * (seq_length - 1), vocab_size]
    return return_value