Example #1
 def construct(self, gradients, overflow):
     """AdamWeightDecayForBert"""
     lr = self.get_lr()
     cond = self.op_cast(F.fill(mstype.int32, self.op_shape(self.beta1), 1) *\
                         self.op_reshape(overflow, (())), mstype.bool_)
     beta1 = self.op_select(
         cond, self.op_cast(F.tuple_to_array((1.0, )), mstype.float32),
         self.beta1)
     beta2 = self.op_select(
         cond, self.op_cast(F.tuple_to_array((1.0, )), mstype.float32),
         self.beta2)
     if self.is_group:
         if self.is_group_lr:
             optim_result = self.hyper_map(
                 F.partial(_adam_opt, self.beta1, self.beta2,
                           self.eps), lr, self.weight_decay,
                 self.parameters, self.moments1, self.moments2, gradients,
                 self.decay_flags, self.optim_filter)
         else:
             optim_result = self.hyper_map(
                 F.partial(_adam_opt, beta1, beta2, self.eps, lr,
                           overflow), self.weight_decay, self.parameters,
                 self.moments1, self.moments2, gradients, self.decay_flags,
                 self.optim_filter)
     else:
         optim_result = self.hyper_map(
             F.partial(_adam_opt, self.beta1, self.beta2, self.eps, lr,
                       self.weight_decay), self.parameters, self.moments1,
             self.moments2, gradients, self.decay_flags, self.optim_filter)
     if self.use_parallel:
         self.broadcast_params(optim_result)
     return optim_result
Example #2
def _update_run_op_for_map(beta1, beta2, eps, lr, weight_decay_tensor, param,
                           m, v, gradient, decay_flag):
    op_mul = P.Mul()
    op_square = P.Square()
    op_sqrt = P.Sqrt()
    op_cast = P.Cast()
    op_reshape = P.Reshape()
    op_shape = P.Shape()

    param_fp32 = op_cast(param, mstype.float32)
    m_fp32 = op_cast(m, mstype.float32)
    v_fp32 = op_cast(v, mstype.float32)
    gradient_fp32 = op_cast(gradient, mstype.float32)

    next_m = op_mul(beta1, m_fp32) + op_mul(
        op_cast(F.tuple_to_array(
            (1.0, )), mstype.float32) - beta1, gradient_fp32)

    next_v = op_mul(beta2, v_fp32) + op_mul(
        op_cast(F.tuple_to_array(
            (1.0, )), mstype.float32) - beta2, op_square(gradient_fp32))

    update = next_m / (op_sqrt(next_v) + eps)
    if decay_flag:
        update = update + op_mul(weight_decay_tensor, param_fp32)

    update_with_lr = op_mul(lr, update)
    next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32))

    next_v = F.depend(next_v, F.assign(param, next_param))
    next_v = F.depend(next_v, F.assign(m, next_m))
    next_v = F.depend(next_v, F.assign(v, next_v))
    return next_v
Example #3
    def construct(self,
                  grads,
                  clip_type,
                  clip_value):
        """
        Construct gradient clip network.

        Args:
            grads (tuple[Tensor]): Gradients to clip.
            clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.
            clip_value (float): Specifies how much to clip.

        Returns:
            tuple[Tensor], clipped gradients.
        """
        if clip_type != 0 and clip_type != 1:  # pylint: disable=R1714
            return grads

        new_grads = ()
        for grad in grads:
            dt = self.dtype(grad)
            if clip_type == 0:
                t = C.clip_by_value(grad, self.cast(F.tuple_to_array((-clip_value,)), dt),
                                    self.cast(F.tuple_to_array((clip_value,)), dt))
            else:
                t = self.clip_by_norm(grad, self.cast(F.tuple_to_array((clip_value,)), dt))
            new_grads = new_grads + (t,)

        return new_grads
Example #4
    def construct(self, x):
        num_batch = P.Shape()(x)[0]
        grid_size = P.Shape()(x)[2:4]

        # Reshape and transpose the feature to [n, grid_size[0], grid_size[1], 3, num_attrib]
        prediction = P.Reshape()(x, (num_batch,
                                     self.num_anchors_per_scale,
                                     self.num_attrib,
                                     grid_size[0],
                                     grid_size[1]))
        prediction = P.Transpose()(prediction, (0, 3, 4, 1, 2))

        range_x = range(grid_size[1])
        range_y = range(grid_size[0])
        grid_x = P.Cast()(F.tuple_to_array(range_x), ms.float32)
        grid_y = P.Cast()(F.tuple_to_array(range_y), ms.float32)
        # Tensor of shape [grid_size[0], grid_size[1], 1, 1] representing the coordinate of x/y axis for each grid
        grid_x = self.tile(self.reshape(grid_x, (1, 1, -1, 1, 1)), (1, grid_size[0], 1, 1, 1))
        grid_y = self.tile(self.reshape(grid_y, (1, -1, 1, 1, 1)), (1, 1, grid_size[1], 1, 1))
        # Shape is [grid_size[0], grid_size[1], 1, 2]
        grid = self.concat((grid_x, grid_y))

        box_xy = prediction[:, :, :, :, :2]
        box_wh = prediction[:, :, :, :, 2:4]
        box_confidence = prediction[:, :, :, :, 4:5]
        box_probs = prediction[:, :, :, :, 5:]

        box_xy = (self.sigmoid(box_xy) + grid) / P.Cast()(F.tuple_to_array((grid_size[1], grid_size[0])), ms.float32)
        box_wh = P.Exp()(box_wh) * self.anchors / self.input_shape
        box_confidence = self.sigmoid(box_confidence)
        box_probs = self.sigmoid(box_probs)

        if self.training:
            return grid, prediction, box_xy, box_wh
        return box_xy, box_wh, box_confidence, box_probs
Example #5
def _update_run_op(beta1, beta2, eps, lr, weight_decay, param, m, v, gradient,
                   decay_flag, optim_filter):
    """
    Update parameters.

    Args:
        beta1 (Tensor): The exponential decay rate for the 1st moment estimations. Should be in range (0.0, 1.0).
        beta2 (Tensor): The exponential decay rate for the 2nd moment estimations. Should be in range (0.0, 1.0).
        eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0.
        lr (Tensor): Learning rate.
        weight_decay (Number): Weight decay. Should be equal to or greater than 0.
        param (Tensor): Parameters.
        m (Tensor): m value of parameters.
        v (Tensor): v value of parameters.
        gradient (Tensor): Gradient of parameters.
        decay_flag (bool): Applies weight decay or not.
        optim_filter (bool): Applies parameter update or not.

    Returns:
        Tensor, the new value of v after updating.
    """
    if optim_filter:
        op_mul = P.Mul()
        op_square = P.Square()
        op_sqrt = P.Sqrt()
        op_cast = P.Cast()
        op_reshape = P.Reshape()
        op_shape = P.Shape()

        param_fp32 = op_cast(param, mstype.float32)
        m_fp32 = op_cast(m, mstype.float32)
        v_fp32 = op_cast(v, mstype.float32)
        gradient_fp32 = op_cast(gradient, mstype.float32)

        next_m = op_mul(beta1, m_fp32) + op_mul(
            op_cast(F.tuple_to_array(
                (1.0, )), mstype.float32) - beta1, gradient_fp32)

        next_v = op_mul(beta2, v_fp32) + op_mul(
            op_cast(F.tuple_to_array(
                (1.0, )), mstype.float32) - beta2, op_square(gradient_fp32))

        update = next_m / (eps + op_sqrt(next_v))
        if decay_flag:
            update = op_mul(weight_decay, param_fp32) + update

        update_with_lr = op_mul(lr, update)
        next_param = param_fp32 - op_reshape(update_with_lr,
                                             op_shape(param_fp32))

        next_param = F.depend(
            next_param, F.assign(param, op_cast(next_param, F.dtype(param))))
        next_param = F.depend(next_param,
                              F.assign(m, op_cast(next_m, F.dtype(m))))
        next_param = F.depend(next_param,
                              F.assign(v, op_cast(next_v, F.dtype(v))))

        return op_cast(next_param, F.dtype(param))
    return gradient
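Note: in practice a per-parameter kernel like this is not called directly; it is registered on a C.MultitypeFuncGraph and applied to every parameter with hyper_map, which is what Example #1 above does. A minimal sketch of that wiring follows; the register type signature and the imports are assumptions based on the common MindSpore optimizer layout, not copied from this source.

from mindspore.ops import composite as C
from mindspore.ops import functional as F

_adam_opt = C.MultitypeFuncGraph("adam_opt")

@_adam_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor",
                    "Tensor", "Tensor", "Tensor", "Tensor", "Bool", "Bool")
def _update_run_op(beta1, beta2, eps, lr, weight_decay, param, m, v, gradient,
                   decay_flag, optim_filter):
    # body as in Example #5 above; returns the updated parameter or the gradient
    return gradient

# Inside the optimizer's construct(), the shared scalars are bound with F.partial and
# the per-parameter tensors are mapped over, mirroring Example #1:
#   optim_result = self.hyper_map(
#       F.partial(_adam_opt, self.beta1, self.beta2, self.eps, lr, self.weight_decay),
#       self.parameters, self.moments1, self.moments2, gradients,
#       self.decay_flags, self.optim_filter)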
Example #6
def _run_opt_with_sparse(opt, sparse_opt, push, pull, use_locking, use_nesterov, target, beta1_power,
                         beta2_power, beta1, beta2, eps, lr, gradient, param, m, v, ps_parameter, cache_enable):
    """Apply sparse adam optimizer to the weight parameter when the gradient is sparse."""
    success = True
    indices = gradient.indices
    values = gradient.values
    if ps_parameter and not cache_enable:
        op_shape = P.Shape()
        shapes = (op_shape(param), op_shape(m), op_shape(v),
                  op_shape(beta1_power), op_shape(beta2_power), op_shape(lr), op_shape(beta1),
                  op_shape(beta2), op_shape(eps), op_shape(values), op_shape(indices))
        success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2,
                                               eps, values, indices), shapes), param))
        return success

    if not target:
        success = F.depend(success, sparse_opt(param, m, v, beta1_power, beta2_power, lr, beta1, beta2,
                                               eps, values, indices))
    else:
        op_mul = P.Mul()
        op_square = P.Square()
        op_sqrt = P.Sqrt()
        scatter_add = P.ScatterAdd(use_locking)

        success = F.depend(success, F.assign(m, op_mul(beta1, m)))
        success = F.depend(success, F.assign(v, op_mul(beta2, v)))

        grad_indices = gradient.indices
        grad_value = gradient.values

        next_m = scatter_add(m,
                             grad_indices,
                             op_mul(F.tuple_to_array((1.0,)) - beta1, grad_value))

        next_v = scatter_add(v,
                             grad_indices,
                             op_mul(F.tuple_to_array((1.0,)) - beta2, op_square(grad_value)))

        if use_nesterov:
            m_temp = next_m * _scaler_ten
            F.assign(m, op_mul(beta1, next_m))
            div_value = scatter_add(m,
                                    op_mul(grad_indices, _scaler_one),
                                    op_mul(F.tuple_to_array((1.0,)) - beta1, grad_value))
            param_update = div_value / (op_sqrt(next_v) + eps)
            F.assign(m, m_temp / _scaler_ten)
        else:
            param_update = next_m / (op_sqrt(next_v) + eps)

        lr_t = lr * op_sqrt(1 - beta2_power) / (1 - beta1_power)
        next_param = param - lr_t * param_update

        success = F.depend(success, F.assign(param, next_param))
        success = F.depend(success, F.assign(m, next_m))
        success = F.depend(success, F.assign(v, next_v))

    return success
Example #7
    def construct(self, grads, clip_min, clip_max):
        new_grads = ()
        for grad in grads:
            dt = self.dtype(grad)

            t = C.clip_by_value(grad, self.cast(F.tuple_to_array((clip_min,)), dt),
                                self.cast(F.tuple_to_array((clip_max,)), dt))
            t = self.cast(t, dt)
            new_grads = new_grads + (t,)
        return new_grads
Example #8
def _clip_grad(clip_type, clip_value, grad):
    dt = F.dtype(grad)
    if clip_type == 0:
        new_grad = C.clip_by_value(
            grad, F.cast(F.tuple_to_array((-clip_value, )), dt),
            F.cast(F.tuple_to_array((clip_value, )), dt))
    else:
        new_grad = nn.ClipByNorm()(grad,
                                   F.cast(F.tuple_to_array((clip_value, )),
                                          dt))
    return new_grad
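Note: _clip_grad is the per-gradient kernel of the usual clip-by-value / clip-by-norm pattern. It is typically registered on a MultitypeFuncGraph so that hyper_map can apply it to the whole gradient tuple, as Example #14 below does with self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads). A minimal sketch of that registration; the type signature and the two constants are illustrative assumptions.

from mindspore.ops import composite as C

GRADIENT_CLIP_TYPE = 1    # illustrative: 0 clips by value, 1 clips by norm
GRADIENT_CLIP_VALUE = 1.0

clip_grad = C.MultitypeFuncGraph("clip_grad")

@clip_grad.register("Number", "Number", "Tensor")
def _clip_grad(clip_type, clip_value, grad):
    # body as in Example #8 above
    return grad

# In the training wrapper's construct():
#   grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)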
Example #9
def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, param, m, v,
                   gradient, decay_flag):
    """
    Update parameters.

    Args:
        beta1 (Tensor): The exponential decay rate for the 1st moment estimates. Should be in range (0.0, 1.0).
        beta2 (Tensor): The exponential decay rate for the 2nd moment estimates. Should be in range (0.0, 1.0).
        eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0.
        lr (Tensor): Learning rate.
        weight_decay_tensor (Tensor): Weight decay. Should be equal to or greater than 0.
        param (Tensor): Parameters.
        m (Tensor): m value of parameters.
        v (Tensor): v value of parameters.
        gradient (Tensor): Gradient of parameters.

    Returns:
        Tensor, the new value of v after updating.
    """
    op_mul = P.Mul()
    op_square = P.Square()
    op_sqrt = P.Sqrt()
    op_cast = P.Cast()
    op_reshape = P.Reshape()
    op_shape = P.Shape()

    param = op_cast(param, mstype.float32)
    m = op_cast(m, mstype.float32)
    v = op_cast(v, mstype.float32)
    gradient = op_cast(gradient, mstype.float32)

    next_m = op_mul(beta1, m) + op_mul(
        op_cast(F.tuple_to_array((1.0, )), mstype.float32) - beta1, gradient)

    next_v = op_mul(beta2, v) + op_mul(
        op_cast(F.tuple_to_array(
            (1.0, )), mstype.float32) - beta2, op_square(gradient))

    update = next_m / (op_sqrt(next_v) + eps)
    if decay_flag:
        update = update + op_mul(weight_decay_tensor, param)

    update_with_lr = op_mul(lr, update)
    next_param = param - op_reshape(update_with_lr, op_shape(param))

    next_v = F.depend(next_v, F.assign(param, next_param))
    next_v = F.depend(next_v, F.assign(m, next_m))
    next_v = F.depend(next_v, F.assign(v, next_v))
    return next_v
Example #10
    def construct(
        self,
        source_eos_ids,
        source_eos_mask,
        target_sos_ids,
        target_sos_mask,
        target_eos_ids,
        target_eos_mask,
    ):
        """Defines the computation performed."""
        source_ids = source_eos_ids
        source_mask = source_eos_mask
        target_ids = target_sos_ids
        target_mask = target_sos_mask
        label_ids = target_eos_ids
        label_weights = target_eos_mask

        weights = self.weights
        loss = self.network(source_ids, source_mask, target_ids, target_mask,
                            label_ids, label_weights)
        grads = self.grad(self.network,
                          weights)(source_ids, source_mask, target_ids,
                                   target_mask, label_ids, label_weights,
                                   self.cast(F.tuple_to_array((self.sens, )),
                                             mstype.float32))
        grads = self.clip_gradients(grads, GRADIENT_CLIP_TYPE,
                                    GRADIENT_CLIP_VALUE)
        if self.reducer_flag:
            # apply grad reducer on grads
            grads = self.grad_reducer(grads)

        succ = self.optimizer(grads)
        return F.depend(loss, succ)
Example #11
File: gpt.py  Project: CaoE/mindspore
    def _attn(self, query, key, value, attention_mask):
        """
        Get the weighted score along the seq_length

        Inputs:
            query: the query matrix
            key: the key matrix
            value: the value matrix
            attention_mask: the attention mask matrix with shape (batch_size, 1, seq_length, seq_length)

        Returns:
            weighted_values: Tensor, the weighted sum scores
        """
        if not self.scale:
            query = query / F.cast(self.coeff, F.dtype(query))
            key = key / F.cast(self.coeff, F.dtype(key))

        score = self.batch_matmul(query, key)
        if self.scale:
            score = score / P.Cast()(self.scale_factor, P.DType()(score))

        ori_dtype = P.DType()(score)
        score = P.Cast()(score, mstype.float32)
        multiply_out = P.Sub()(P.Cast()(F.tuple_to_array((1.0,)), P.DType()(score)),
                               P.Cast()(attention_mask, P.DType()(score)))

        adder = P.Mul()(multiply_out, self.multiply_data)
        attention_scores = adder + score

        attention_scores = P.Cast()(attention_scores, ori_dtype)
        attention_probs = Softmax()(attention_scores)

        attention_probs = self.prob_dropout(attention_probs)
        weighted_values = self.batch_matmul(attention_probs, value)
        return weighted_values
Example #12
    def construct(self, input_ids, input_mask, input_position=None, attention_mask=None, layer_past=None):
        """PanGu Alpha model"""
        if not self.use_past:
            layer_past = self.past

        input_embedding, embedding_table = self.word_embedding(input_ids)
        if not self.eod_reset:
            batch_size, seq_length = F.shape(input_ids)
            input_position = F.tuple_to_array(F.make_range(seq_length))
            input_position = P.Tile()(input_position, (batch_size, 1))
            attention_mask = self.get_attention_mask(input_mask)
            
        position_embedding = self.position_embedding(input_position)
        hidden_states = self.add(input_embedding, position_embedding)
        hidden_states = self.dropout(hidden_states)
        hidden_states = P.Cast()(hidden_states, mstype.float16)
        attention_mask = self.expand_dims(attention_mask, 1)

        present_layer = ()
        for i in range(self.num_layers):
            hidden_states, present = self.blocks[i](hidden_states,
                                                    attention_mask, layer_past)
            present_layer = present_layer + (present,)

        output_state = self.layernorm(hidden_states)
        output_state = F.cast(output_state, self.dtype)

        top_query_hidden_states = self.top_query_embedding(input_position)
        output_state, present = self.top_query_layer(output_state, top_query_hidden_states,
                                                     attention_mask, layer_past)
        present_layer = present_layer + (present,)

        return output_state, present_layer, embedding_table
Example #13
    def construct(self, input_ids, input_mask, layer_past=None):
        """GPT model"""
        if not self.use_past:
            layer_past = self.past

        input_embedding, embedding_table = self.word_embedding(input_ids)

        batch_size, seq_length = F.shape(input_ids)
        input_position = F.tuple_to_array(F.make_range(seq_length))
        input_position = P.Tile()(input_position, (batch_size, 1))

        position_embedding = self.position_embedding(input_position)
        hidden_states = input_embedding + position_embedding

        hidden_states = P.Cast()(hidden_states, mstype.float16)
        attention_mask = self.get_attention_mask(input_mask)
        attention_mask = P.ExpandDims()(attention_mask, 1)

        present_layer = ()
        for i in range(self.num_layers):
            hidden_states, present = self.blocks[i](hidden_states,
                                                    attention_mask, layer_past)
            present_layer = present_layer + (present, )

        output_state = self.layernorm(hidden_states)
        return output_state, present_layer, embedding_table
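Note: the input_position idiom above, F.tuple_to_array(F.make_range(seq_length)) tiled across the batch, is the main use of tuple_to_array in the GPT/PanGu examples. Isolated into a tiny cell it looks roughly like this (a sketch assuming graph mode; PositionIds is a hypothetical name):

import mindspore.nn as nn
import mindspore.ops.operations as P
from mindspore.ops import functional as F

class PositionIds(nn.Cell):
    """Builds a [batch_size, seq_length] tensor of position indices 0 .. seq_length - 1."""
    def construct(self, input_ids):
        batch_size, seq_length = F.shape(input_ids)
        # make_range yields a compile-time tuple; tuple_to_array turns it into a 1-D tensor
        input_position = F.tuple_to_array(F.make_range(seq_length))
        return P.Tile()(input_position, (batch_size, 1))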
Example #14
    def construct(self,
                  input_ids,
                  input_mask,
                  token_type_id,
                  next_sentence_labels,
                  masked_lm_positions,
                  masked_lm_ids,
                  masked_lm_weights):
        """Defines the computation performed."""
        weights = self.weights

        loss = self.network(input_ids,
                            input_mask,
                            token_type_id,
                            next_sentence_labels,
                            masked_lm_positions,
                            masked_lm_ids,
                            masked_lm_weights)
        grads = self.grad(self.network, weights)(input_ids,
                                                 input_mask,
                                                 token_type_id,
                                                 next_sentence_labels,
                                                 masked_lm_positions,
                                                 masked_lm_ids,
                                                 masked_lm_weights,
                                                 self.cast(F.tuple_to_array((self.sens,)),
                                                           mstype.float32))
        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
        grads = self.grad_reducer(grads)
        succ = self.optimizer(grads)
        return F.depend(loss, succ)
Example #15
    def construct(self, input_ids, input_mask, token_type_id, label_ids):
        """Defines the computation performed."""
        weights = self.weights
        for i in range(self.length):
            F.assign(self.saved_params[i], weights[i])

        for i in range(self.quant_embedding_list_length):
            quant_embedding = self.quantize_embedding(
                weights[self.quant_embedding_list[i]])
            F.assign(weights[self.quant_embedding_list[i]], quant_embedding)

        for i in range(self.quant_weight_list_length):
            quant_weight = self.quantize_weight(
                weights[self.quant_weight_list[i]])
            F.assign(weights[self.quant_weight_list[i]], quant_weight)

        grads = self.grad(self.network,
                          weights)(input_ids, input_mask, token_type_id,
                                   label_ids,
                                   self.cast(F.tuple_to_array((self.sens, )),
                                             mstype.float32))
        # apply grad reducer on grads
        grads = self.grad_reducer(grads)
        grads = self.hyper_map(
            F.partial(clip_grad, self.clip_type, self.clip_value), grads)

        for i in range(self.length):
            param = F.depend(self.saved_params[i], grads)
            F.assign(weights[i], param)

        succ = self.optimizer(grads)
        return succ
Example #16
 def construct(self, grads, clip_type, clip_value):
     """clip gradients"""
     if clip_type not in (0, 1):
         return grads
     new_grads = ()
     for grad in grads:
         dt = self.dtype(grad)
         if clip_type == 0:
             t = C.clip_by_value(
                 grad, self.cast(F.tuple_to_array((-clip_value, )), dt),
                 self.cast(F.tuple_to_array((clip_value, )), dt))
         else:
             t = self.clip_by_norm(
                 grad, self.cast(F.tuple_to_array((clip_value, )), dt))
         new_grads = new_grads + (t, )
     return new_grads
Example #17
    def construct(self, prediction_scores, label_ids, label_weights):
        """
        Construct network to calculate loss.

        Args:
            prediction_scores (Tensor): Prediction scores.
            label_ids (Tensor): Labels.
            label_weights (Tensor): Mask tensor.

        Returns:
            Tensor, final loss.
        """
        label_shape = self.get_shape(label_ids)

        label_ids = self.reshape(label_ids, (label_shape[0] * label_shape[1],))
        label_weights = self.cast(
            self.reshape(label_weights, (label_shape[0] * label_shape[1],)),
            mstype.float32
        )
        one_hot_labels = self.onehot(label_ids, self.vocab_size, self.on_value, self.off_value)

        per_example_loss = self.neg(self.reduce_sum(prediction_scores * one_hot_labels, self.last_idx))
        numerator = self.reduce_sum(label_weights * per_example_loss, ())
        denominator = self.reduce_sum(label_weights, ()) + self.cast(F.tuple_to_array((1e-5,)), mstype.float32)
        loss = numerator / denominator

        return loss
Example #18
    def construct(self, prediction_scores, seq_relationship_score,
                  masked_lm_ids, masked_lm_weights, next_sentence_labels):
        """Defines the computation performed."""
        label_ids = self.reshape(masked_lm_ids, self.last_idx)
        label_weights = self.cast(
            self.reshape(masked_lm_weights, self.last_idx), mstype.float32)
        one_hot_labels = self.onehot(label_ids, self.vocab_size, self.on_value,
                                     self.off_value)

        per_example_loss = self.neg(
            self.reduce_sum(prediction_scores * one_hot_labels, self.last_idx))
        numerator = self.reduce_sum(label_weights * per_example_loss, ())
        denominator = self.reduce_sum(label_weights, ()) + self.cast(
            F.tuple_to_array((1e-5, )), mstype.float32)
        masked_lm_loss = numerator / denominator

        # next_sentence_loss
        labels = self.reshape(next_sentence_labels, self.last_idx)
        one_hot_labels = self.onehot(labels, 2, self.on_value, self.off_value)
        per_example_loss = self.neg(
            self.reduce_sum(one_hot_labels * seq_relationship_score,
                            self.last_idx))
        next_sentence_loss = self.reduce_mean(per_example_loss, self.last_idx)

        # total_loss
        total_loss = masked_lm_loss + next_sentence_loss

        return total_loss
Example #19
    def construct(self, prediction, pred_xy, pred_wh, y_true, gt_box, input_shape):
        """
        prediction : origin output from yolo
        pred_xy: (sigmoid(xy)+grid)/grid_size
        pred_wh: (exp(wh)*anchors)/input_shape
        y_true : after normalize
        gt_box: [batch, maxboxes, xyhw] after normalize
        """
        object_mask = y_true[:, :, :, :, 4:5]
        class_probs = y_true[:, :, :, :, 5:]
        true_boxes = y_true[:, :, :, :, :4]

        grid_shape = P.Shape()(prediction)[1:3]
        grid_shape = P.Cast()(F.tuple_to_array(grid_shape[::-1]), ms.float32)

        pred_boxes = self.concat((pred_xy, pred_wh))
        true_wh = y_true[:, :, :, :, 2:4]
        true_wh = P.Select()(P.Equal()(true_wh, 0.0),
                             P.Fill()(P.DType()(true_wh),
                                      P.Shape()(true_wh), 1.0),
                             true_wh)
        true_wh = P.Log()(true_wh / self.anchors * input_shape)
        # box_loss_scale = 2 - w*h: larger boxes get a smaller weight, since small objects need more precise regression
        box_loss_scale = 2 - y_true[:, :, :, :, 2:3] * y_true[:, :, :, :, 3:4]

        gt_shape = P.Shape()(gt_box)
        gt_box = P.Reshape()(gt_box, (gt_shape[0], 1, 1, 1, gt_shape[1], gt_shape[2]))

        # add one more dimension for broadcast
        iou = self.iou(P.ExpandDims()(pred_boxes, -2), gt_box)
        # gt_box is x,y,h,w after normalize
        # [batch, grid[0], grid[1], num_anchor, num_gt]
        best_iou = self.reduce_max(iou, -1)
        # [batch, grid[0], grid[1], num_anchor]

        # ignore_mask IOU too small
        ignore_mask = best_iou < self.ignore_threshold
        ignore_mask = P.Cast()(ignore_mask, ms.float32)
        ignore_mask = P.ExpandDims()(ignore_mask, -1)
        # Backpropagating through ignore_mask spends a lot of time in MaximumGrad and MinimumGrad,
        # so we stop its gradient
        ignore_mask = F.stop_gradient(ignore_mask)

        confidence_loss = self.confidence_loss(object_mask, prediction[:, :, :, :, 4:5], ignore_mask)
        class_loss = self.class_loss(object_mask, prediction[:, :, :, :, 5:], class_probs)

        object_mask_me = P.Reshape()(object_mask, (-1, 1))  # [8, 72, 72, 3, 1]
        box_loss_scale_me = P.Reshape()(box_loss_scale, (-1, 1))
        pred_boxes_me = xywh2x1y1x2y2(pred_boxes)
        pred_boxes_me = P.Reshape()(pred_boxes_me, (-1, 4))
        true_boxes_me = xywh2x1y1x2y2(true_boxes)
        true_boxes_me = P.Reshape()(true_boxes_me, (-1, 4))
        ciou = self.giou(pred_boxes_me, true_boxes_me)
        ciou_loss = object_mask_me * box_loss_scale_me * (1 - ciou)
        ciou_loss_me = self.reduce_sum(ciou_loss, ())
        loss = ciou_loss_me * 10 + confidence_loss + class_loss
        batch_size = P.Shape()(prediction)[0]
        return loss / batch_size
Example #20
    def construct(self, grads, clip_type, clip_value):
        """
        Construct the gradient-clipping compute flow.
        """
        # pylint: disable=consider-using-in
        if clip_type != 0 and clip_type != 1:
            return grads

        new_grads = ()
        for grad in grads:
            if clip_type == 0:
                t = C.clip_by_value(grad, F.tuple_to_array((-clip_value,)),
                                    F.tuple_to_array((clip_value,)))
            else:
                t = self.clip_by_norm(grad, F.tuple_to_array((clip_value,)))
            new_grads = new_grads + (t,)

        return new_grads
Example #21
    def construct(self, grads, clip_type, clip_value):
        # return grads
        if clip_type != 0 and clip_type != 1:
            return grads

        new_grads = ()
        for grad in grads:
            dt = self.dtype(grad)
            if clip_type == 0:
                t = C.clip_by_value(
                    grad, self.cast(F.tuple_to_array((-clip_value, )), dt),
                    self.cast(F.tuple_to_array((clip_value, )), dt))
            else:
                t = self.clip_by_norm(
                    grad, self.cast(F.tuple_to_array((clip_value, )), dt))
            new_grads = new_grads + (t, )

        return new_grads
Example #22
File: gpt.py  Project: CaoE/mindspore
 def construct(self, logits, label, input_mask):
     logits = self.log_softmax(P.Cast()(logits, mstype.float32))
     label = P.Reshape()(label, (-1,))
     one_hot_label = self.onehot(label, self.vocab_size, self.on_value, self.off_value)
     loss_sum = P.Neg()(self.sum(logits*one_hot_label, (-1,)))
     input_mask = P.Reshape()(input_mask, (-1,))
     numerator = self.sum(loss_sum*input_mask)
     denominator = self.sum(input_mask) + P.Cast()(F.tuple_to_array((1e-5,)), mstype.float32)
     loss = numerator / denominator
     return loss
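Note: the self.sum(input_mask) + F.tuple_to_array((1e-5,)) term keeps the masked mean well defined even if input_mask sums to zero. A plain-NumPy illustration of the same arithmetic (not part of the example above):

import numpy as np

per_token_loss = np.array([2.0, 1.5, 0.7, 0.0])
input_mask = np.array([1.0, 1.0, 1.0, 0.0])        # 0 marks padding positions

numerator = np.sum(per_token_loss * input_mask)    # 4.2
denominator = np.sum(input_mask) + 1e-5            # 3.00001
loss = numerator / denominator                     # ~1.4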
Example #23
    def construct(self, input_tensor):
        attention_scores = input_tensor
        attention_scores = self.cast(attention_scores, mstype.float32)
        if self.has_attention_mask:
            attention_mask = self.expand_dims(self.attention_mask, 1)
            multiply_out = self.sub(self.cast(F.tuple_to_array((1.0,)), mstype.float32),
                                    self.cast(attention_mask, self.get_dtype(attention_scores)))

            adder = self.multiply(multiply_out, self.multiply_data)
            attention_scores = self.add(adder, attention_scores)
        return attention_scores
Example #24
    def construct(self, x, input_shape):
        """construct method"""
        num_batch = P.Shape()(x)[0]
        grid_size = P.Shape()(x)[2:4]

        # Reshape and transpose the feature to [n, grid_size[0], grid_size[1], 3, num_attrib]
        prediction = P.Reshape()(x, (num_batch,
                                     self.num_anchors_per_scale,
                                     self.num_attrib,
                                     grid_size[0],
                                     grid_size[1]))
        prediction = P.Transpose()(prediction, (0, 3, 4, 1, 2))

        range_x = range(grid_size[1])
        range_y = range(grid_size[0])
        grid_x = P.Cast()(F.tuple_to_array(range_x), ms.float32)
        grid_y = P.Cast()(F.tuple_to_array(range_y), ms.float32)
        # Tensor of shape [grid_size[0], grid_size[1], 1, 1] representing the coordinate of x/y axis for each grid
        # [batch, gridx, gridy, 1, 1]
        grid_x = self.tile(self.reshape(grid_x, (1, 1, -1, 1, 1)), (1, grid_size[0], 1, 1, 1))
        grid_y = self.tile(self.reshape(grid_y, (1, -1, 1, 1, 1)), (1, 1, grid_size[1], 1, 1))
        # Shape is [grid_size[0], grid_size[1], 1, 2]
        grid = self.concat((grid_x, grid_y))

        box_xy = prediction[:, :, :, :, :2]
        box_wh = prediction[:, :, :, :, 2:4]
        box_confidence = prediction[:, :, :, :, 4:5]
        box_probs = prediction[:, :, :, :, 5:]

        # gridsize1 is x
        # gridsize0 is y
        box_xy = (self.scale_x_y * self.sigmoid(box_xy) - self.offset_x_y + grid) / \
                 P.Cast()(F.tuple_to_array((grid_size[1], grid_size[0])), ms.float32)
        # box_wh is w->h
        box_wh = P.Exp()(box_wh) * self.anchors / input_shape
        box_confidence = self.sigmoid(box_confidence)
        box_probs = self.sigmoid(box_probs)

        if self.conf_training:
            return prediction, box_xy, box_wh
        return self.concat((box_xy, box_wh, box_confidence, box_probs))
Example #25
    def construct(self, beta1, beta2, gradient, eps, weight_decay_tensor, lr):
        param_fp32 = self.op_cast(self.param, mstype.float32)
        m_fp32 = self.op_cast(self.m, mstype.float32)
        v_fp32 = self.op_cast(self.v, mstype.float32)
        gradient_fp32 = self.op_cast(gradient, mstype.float32)

        next_m = self.op_mul(beta1, m_fp32) + \
                 self.op_mul(self.op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta1, gradient_fp32)
        next_v = self.op_mul(beta2, v_fp32) + self.op_mul(self.op_cast(F.tuple_to_array((1.0,)), mstype.float32) - \
                                                          beta2, self.op_square(gradient_fp32))
        update = next_m / (eps + self.op_sqrt(next_v))
        if self.decay_flag:
            update = self.op_mul(weight_decay_tensor, param_fp32) + update
        update_with_lr = self.op_mul(lr, update)
        next_param = param_fp32 - self.op_reshape(update_with_lr,
                                                  self.op_shape(param_fp32))

        next_v = F.depend(next_v, F.assign(self.param, next_param))
        next_v = F.depend(next_v, F.assign(self.m, next_m))
        next_v = F.depend(next_v, F.assign(self.v, next_v))
        return next_v
Example #26
    def construct(self, prediction_scores, label_ids, label_weights):
        """Defines the computation performed."""
        label_ids = self.reshape(label_ids, self.flat_shape)
        label_weights = self.cast(self.reshape(label_weights, self.flat_shape), mstype.float32)
        one_hot_labels = self.onehot(label_ids, self.vocab_size, self.on_value, self.off_value)

        per_example_loss = self.neg(self.reduce_sum(prediction_scores * one_hot_labels, self.last_idx))
        numerator = self.reduce_sum(label_weights * per_example_loss, ())
        denominator = self.reduce_sum(label_weights, ()) + \
                      self.cast(F.tuple_to_array((1e-5,)), mstype.float32)
        loss = numerator / denominator
        return loss
Example #27
def _clip_grad(clip_type, clip_value, grad):
    """
    Clip gradients.

    Inputs:
        clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.
        clip_value (float): Specifies how much to clip.
        grad (tuple[Tensor]): Gradients.

    Outputs:
        tuple[Tensor], clipped gradients.
    """
    if clip_type not in (0, 1):
        return grad
    dt = F.dtype(grad)
    if clip_type == 0:
        new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt),
                                   F.cast(F.tuple_to_array((clip_value,)), dt))
    else:
        new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
    return new_grad
Example #28
    def construct(self, grid, prediction, pred_xy, pred_wh, y_true, gt_box):

        object_mask = y_true[:, :, :, :, 4:5]
        class_probs = y_true[:, :, :, :, 5:]

        grid_shape = P.Shape()(prediction)[1:3]
        grid_shape = P.Cast()(F.tuple_to_array(grid_shape[::-1]), ms.float32)

        pred_boxes = self.concat((pred_xy, pred_wh))
        true_xy = y_true[:, :, :, :, :2] * grid_shape - grid
        true_wh = y_true[:, :, :, :, 2:4]
        true_wh = P.Select()(P.Equal()(true_wh,
                                       0.0), P.Fill()(P.DType()(true_wh),
                                                      P.Shape()(true_wh), 1.0),
                             true_wh)
        true_wh = P.Log()(true_wh / self.anchors * self.input_shape)
        box_loss_scale = 2 - y_true[:, :, :, :, 2:3] * y_true[:, :, :, :, 3:4]

        gt_shape = P.Shape()(gt_box)
        gt_box = P.Reshape()(gt_box,
                             (gt_shape[0], 1, 1, 1, gt_shape[1], gt_shape[2]))

        iou = self.iou(P.ExpandDims()(pred_boxes, -2),
                       gt_box)  # [batch, grid[0], grid[1], num_anchor, num_gt]
        best_iou = self.reduce_max(iou,
                                   -1)  # [batch, grid[0], grid[1], num_anchor]
        ignore_mask = best_iou < self.ignore_threshold
        ignore_mask = P.Cast()(ignore_mask, ms.float32)
        ignore_mask = P.ExpandDims()(ignore_mask, -1)
        ignore_mask = F.stop_gradient(ignore_mask)

        xy_loss = object_mask * box_loss_scale * self.cross_entropy(
            prediction[:, :, :, :, :2], true_xy)
        wh_loss = object_mask * box_loss_scale * 0.5 * P.Square()(
            true_wh - prediction[:, :, :, :, 2:4])
        confidence_loss = self.cross_entropy(prediction[:, :, :, :, 4:5],
                                             object_mask)
        confidence_loss = object_mask * confidence_loss + (
            1 - object_mask) * confidence_loss * ignore_mask
        class_loss = object_mask * self.cross_entropy(
            prediction[:, :, :, :, 5:], class_probs)

        # Get smooth loss
        xy_loss = self.reduce_sum(xy_loss, ())
        wh_loss = self.reduce_sum(wh_loss, ())
        confidence_loss = self.reduce_sum(confidence_loss, ())
        class_loss = self.reduce_sum(class_loss, ())

        loss = xy_loss + wh_loss + confidence_loss + class_loss
        return loss / P.Shape()(prediction)[0]
Example #29
 def construct(self):
     """position matrix generator"""
     range_vec_row_out = self.cast(F.tuple_to_array(F.make_range(self._length)), mstype.int32)
     range_vec_col_out = self.range_mat(range_vec_row_out, (self._length, -1))
     tile_row_out = self.tile(range_vec_row_out, (self._length,))
     tile_col_out = self.tile(range_vec_col_out, (1, self._length))
     range_mat_out = self.range_mat(tile_row_out, (self._length, self._length))
     transpose_out = self.range_mat(tile_col_out, (self._length, self._length))
     distance_mat = self.sub(range_mat_out, transpose_out)
     distance_mat_clipped = C.clip_by_value(distance_mat,
                                            self._min_relative_position,
                                            self._max_relative_position)
     # Shift values to be >=0. Each integer still uniquely identifies a
     # relative position difference.
     final_mat = distance_mat_clipped + self._max_relative_position
     return final_mat
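Note: a plain-NumPy re-derivation of the matrix built by the generator above, assuming _length = 3 and _max_relative_position = -_min_relative_position = 2:

import numpy as np

length, max_rel = 3, 2
rows = np.tile(np.arange(length), (length, 1))   # [[0, 1, 2], [0, 1, 2], [0, 1, 2]]
distance_mat = rows - rows.T                     # [[0, 1, 2], [-1, 0, 1], [-2, -1, 0]]
final_mat = np.clip(distance_mat, -max_rel, max_rel) + max_rel
print(final_mat)                                 # [[2 3 4] [1 2 3] [0 1 2]]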
Example #30
    def construct(self, logits, label_ids, input_mask=None):
        """
        Calculate loss

        Args:
            logits (Tensor): the probability distribution over vocabulary.
            label_ids (Tensor): the indices of input sequence tokens in the vocabulary.
            input_mask (Tensor): input sentences padding mask, where 0 indicates padding position.

        Returns:
            return_value (Tensor, mstype.float32): the logits if is_training is False; otherwise, the computed loss.
        """

        # logits [batch * (seq_length-1), vocab_size]   label_ids [batch, seq_length-1]
        if self.is_training:
            label_ids = self.reshape(
                label_ids, self.last_idx)  # label_ids [batch * (seq_length-1)]
            one_hot_labels = self.onehot(
                label_ids, self.num_labels, self.on_value,
                self.off_value)  # [batch * (seq_length-1), vocab_size]
            per_example_loss = self.neg(
                self.reduce_sum(one_hot_labels * logits,
                                self.last_idx))  # [batch * (seq_length-1)]

            # for PPL calculation in evaluation
            if input_mask is not None:
                input_mask = self.cast(
                    self.reshape(input_mask, self.last_idx),
                    mstype.float32)  # [batch * (seq_length-1)]

                valid_loss_sum = self.reduce_sum(input_mask * per_example_loss,
                                                 ())
                valid_element_sum = self.reduce_sum(
                    input_mask, ()) + self.cast(F.tuple_to_array(
                        (1e-5, )), mstype.float32)
                loss = valid_loss_sum / valid_element_sum
            else:
                loss = self.reduce_mean(per_example_loss,
                                        self.last_idx)  # a number
            return_value = self.cast(loss, mstype.float32)
        else:
            return_value = logits * 1.0  # [batch * (seq_length-1), vocab_size]

        return return_value