Example #1
    def construct(self, logits, label):
        label = self.onehot(label, F.shape(logits)[-1], self.on_value, self.off_value)
        sigmoid_cross_entropy = self.sigmiod_cross_entropy(logits, label)
        sigmoid = self.sigmoid(logits)
        label = F.cast(label, mstype.float32)
        p_t = label * sigmoid + (1 - label) * (1 - sigmoid)
        modulating_factor = self.pow(1 - p_t, self.gamma)
        alpha_weight_factor = label * self.alpha + (1 - label) * (1 - self.alpha)
        focal_loss = modulating_factor * alpha_weight_factor * sigmoid_cross_entropy
        return focal_loss
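The snippet above is the standard sigmoid focal loss. For reference, the same arithmetic can be written in plain NumPy; this is an illustrative sketch only, and the gamma/alpha defaults are assumed because the class __init__ is not shown.

import numpy as np

def sigmoid_focal_loss(logits, labels_onehot, gamma=2.0, alpha=0.25):
    """Illustrative NumPy sketch of the focal-loss arithmetic above (gamma/alpha assumed)."""
    sigmoid = 1.0 / (1.0 + np.exp(-logits))
    # numerically stable element-wise sigmoid cross entropy
    ce = np.maximum(logits, 0) - logits * labels_onehot + np.log1p(np.exp(-np.abs(logits)))
    # p_t is the probability assigned to the true class of each element
    p_t = labels_onehot * sigmoid + (1 - labels_onehot) * (1 - sigmoid)
    modulating_factor = (1 - p_t) ** gamma          # down-weights easy examples
    alpha_weight = labels_onehot * alpha + (1 - labels_onehot) * (1 - alpha)
    return modulating_factor * alpha_weight * ce

print(sigmoid_focal_loss(np.array([[2.0, -1.0, 0.5]]), np.array([[1.0, 0.0, 0.0]])))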
Example #2
def _tensor_apply_decay_with_sparse(weight_decay, if_apply, weight, gradient):
    """Get grad with weight_decay."""
    if if_apply:
        indices = gradient.indices
        values = op_add((op_gather(weight, indices, 0) * F.cast(weight_decay, F.dtype(weight)),
                         gradient.values))
        shape = gradient.dense_shape
        return RowTensor(indices, values, shape)
    return gradient
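The sparse path above only decays the weight rows actually touched by the gradient. A minimal NumPy sketch of that idea (illustrative only, not the MindSpore helper):

import numpy as np

def apply_decay_sparse(weight_decay, weight, grad_indices, grad_values):
    # gather only the rows referenced by the sparse gradient and add the decay term
    return grad_indices, weight[grad_indices] * weight_decay + grad_values

w = np.arange(12, dtype=np.float32).reshape(4, 3)
print(apply_decay_sparse(0.01, w, np.array([0, 2]), np.ones((2, 3), dtype=np.float32)))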
Example #3
    def construct(self, prob, halting_prob, n_updates):
        # zeros = self.zeros_like(halting_prob)
        # ones = self.ones_like(halting_prob)

        # Mask for inputs which have not halted yet
        running = F.cast(halting_prob < 1.0, ms.float32)
        # running = self.select(halting_prob < 1.0,ones,zeros)

        # Add the halting probability for this step to the halting
        # probabilities for those inputs which haven't halted yet
        add_prob = prob * running
        new_prob = halting_prob + add_prob
        mask_run = F.cast(new_prob <= self.threshold, ms.float32)
        mask_halt = F.cast(new_prob > self.threshold, ms.float32)
        # mask_run = self.select(new_prob <= self.threshold,ones,zeros)
        # mask_halt = self.select(new_prob > self.threshold,ones,zeros)

        # Mask of inputs which haven't halted, and didn't halt this step
        still_running = mask_run * running
        running_prob = halting_prob + prob * still_running

        # Mask of inputs which halted at this step
        new_halted = mask_halt * running

        # Compute remainders for the inputs which halted at this step
        remainders = new_halted * (1.0 - running_prob)

        # Add the remainders to those inputs which halted at this step
        # halting_prob = new_prob + remainders
        dp = add_prob + remainders

        # Increment n_updates for all inputs which are still running
        # n_updates = n_updates + running
        dn = running

        # Compute the weight to be applied to the new state and output
        # 0 when the input has already halted
        # prob when the input hasn't halted yet
        # the remainders when it halted this step
        update_weights = prob * still_running + new_halted * remainders
        w = F.expand_dims(update_weights, -1)

        return w, dp, dn
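The masks above implement one step of Adaptive Computation Time halting. A plain-NumPy mirror of the same update (illustrative; the threshold value is an assumption, since in the snippet it comes from the class):

import numpy as np

def act_step(prob, halting_prob, threshold=0.99):
    running = (halting_prob < 1.0).astype(np.float32)          # inputs that have not halted
    add_prob = prob * running
    new_prob = halting_prob + add_prob
    still_running = (new_prob <= threshold).astype(np.float32) * running
    new_halted = (new_prob > threshold).astype(np.float32) * running
    running_prob = halting_prob + prob * still_running
    remainders = new_halted * (1.0 - running_prob)             # leftover probability mass at halt
    dp = add_prob + remainders                                 # increment for halting_prob
    dn = running                                               # increment for n_updates
    update_weights = prob * still_running + new_halted * remainders
    return update_weights[..., None], dp, dn

print(act_step(np.array([0.6, 0.3]), np.array([0.5, 0.0])))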
Example #4
def _convert_img_dtype_to_float32(img, max_val):
    """convert img dtype to float32"""
    # Usually max_val is 1.0 or 255. Scale the pixel values when max_val > 1,
    # and just cast to float32 otherwise.
    ret = F.cast(img, mstype.float32)
    max_val = F.scalar_cast(max_val, mstype.float32)
    if max_val > 1.:
        scale = 1. / max_val
        ret = ret * scale
    return ret
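In other words, integer images (max_val > 1, e.g. 255) are rescaled into [0, 1] after the cast, while float images are cast only. A minimal NumPy equivalent (illustrative sketch):

import numpy as np

def to_float32(img, max_val):
    ret = img.astype(np.float32)
    if max_val > 1.0:
        ret = ret / float(max_val)       # scale e.g. uint8 pixels into [0, 1]
    return ret

print(to_float32(np.array([0, 128, 255], dtype=np.uint8), 255))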
Example #5
    def construct(self, x):
        input_shape = F.shape(x)[2:4]
        input_shape = F.cast(self.tenser_to_array(input_shape), ms.float32)
        big_object_output, medium_object_output, small_object_output = self.feature_map(x)
        output_big = self.detect_1(big_object_output, input_shape)
        output_me = self.detect_2(medium_object_output, input_shape)
        output_small = self.detect_3(small_object_output, input_shape)
        # output_big is the final output and comes from the smallest feature map
        return output_big, output_me, output_small
Example #6
    def construct(self, x, query_hidden_state, input_mask, layer_past=None):
        input_x = self.layernorm1(x)
        input_x = F.cast(input_x, self.dtype)
        attention, layer_present = self.attention(input_x,
                                                  query_hidden_state,
                                                  input_mask,
                                                  layer_past)
        if self.post_layernorm_residual:
            x = self.add(input_x, attention)
        else:
            x = self.add(x, attention)

        output_x = self.layernorm2(x)
        output_x = F.cast(output_x, self.dtype)
        mlp_logit = self.output(output_x)
        if self.post_layernorm_residual:
            output = self.last_add(output_x, mlp_logit)
        else:
            output = self.last_add(x, mlp_logit)
        return output, layer_present
Example #7
    def construct(self, input_ids):
        """evaluation net"""
        input_mask = F.cast(F.not_equal(input_ids, 0), mstype.float32)
        logits = self.backbone(input_ids, input_mask)
        outputs = None
        if self.generate:
            outputs = nn.LogSoftmax()(logits)
            outputs = F.tensor_pow(np.e, outputs)
        else:
            outputs = self.argmax(logits)
        return outputs
Example #8
def _clip_grad(clip_type, clip_value, grad):
    """
    Clip gradients.

    Inputs:
        clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.
        clip_value (float): Specifies how much to clip.
        grad (tuple[Tensor]): Gradients.

    Outputs:
        tuple[Tensor], clipped gradients.
    """
    if clip_type not in (0, 1):
        return grad
    dt = F.dtype(grad)
    if clip_type == 0:
        new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt),
                                   F.cast(F.tuple_to_array((clip_value,)), dt))
    else:
        new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
    return new_grad
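clip_type 0 clamps each gradient element to [-clip_value, clip_value]; clip_type 1 rescales the whole tensor so its L2 norm does not exceed clip_value. A NumPy sketch of both modes (illustrative only):

import numpy as np

def clip_grad(clip_type, clip_value, grad):
    if clip_type not in (0, 1):
        return grad
    if clip_type == 0:
        return np.clip(grad, -clip_value, clip_value)          # clip by value
    norm = np.linalg.norm(grad)                                # clip by L2 norm
    return grad * (clip_value / norm) if norm > clip_value else grad

g = np.array([3.0, -4.0])
print(clip_grad(0, 1.0, g))   # [ 1. -1.]
print(clip_grad(1, 1.0, g))   # [ 0.6 -0.8]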
Example #9
def _tensors_cast_datatype(datatype, grad):
    """
    Cast gradient to datatype.

    Args:
        datatype (mstype): the destination datatype of gradient.
        grad (Tensor): The gradient tensor before operation.

    Returns:
        Tensor, the gradient tensor after operation.
    """
    return F.cast(grad, datatype)
Example #10
def _tensors_cast_datatype(datatype, parameters):
    """
    Cast parameters to datatype.

    Args:
        datatype (mstype): the destination datatype of parameters.
        parameters (Tensor): The parameters before operation.

    Returns:
        Tensor, the parameters after operation.
    """
    return F.cast(parameters, datatype)
Example #11
    def construct(self, pred_label, gt_label, num_matched_boxes):
        gt_label = F.cast(gt_label, mstype.int32)
        mask = F.cast(self.less(0, gt_label), mstype.float32)
        gt_label_shape = F.shape(gt_label)
        pred_label = F.reshape(pred_label, (-1, self.num_classes))
        gt_label = F.reshape(gt_label, (-1, ))
        cross_entropy = self.cross_entropy(pred_label, gt_label)
        cross_entropy = F.reshape(cross_entropy, gt_label_shape)

        # Hard example mining
        num_matched_boxes = F.reshape(num_matched_boxes, (-1, ))
        neg_masked_cross_entropy = F.cast(cross_entropy * (1 - mask),
                                          mstype.float16)
        _, loss_idx = self.sort_descend(neg_masked_cross_entropy,
                                        self.num_boxes)
        _, relative_position = self.sort(F.cast(loss_idx, mstype.float16),
                                         self.num_boxes)
        num_neg_boxes = self.minimum(num_matched_boxes * self.neg_pre_positive,
                                     self.num_boxes)
        tile_num_neg_boxes = self.tile(self.expand_dims(num_neg_boxes, -1),
                                       (1, self.num_boxes))
        top_k_neg_mask = F.cast(
            self.less(relative_position, tile_num_neg_boxes), mstype.float32)
        class_loss = self.reduce_sum(cross_entropy * (mask + top_k_neg_mask),
                                     1)
        return self.reduce_mean(
            class_loss / F.cast(num_matched_boxes, mstype.float32), 0)
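The double sort above is a rank trick: sorting the masked negative losses, then sorting the resulting indices, yields each box's rank among the negatives, so the top num_matched * neg_pre_positive hardest negatives can be kept. A NumPy sketch of that mining step for a single sample (illustrative; the helper name is hypothetical):

import numpy as np

def hard_negative_mask(cross_entropy, pos_mask, num_matched, neg_pre_positive=3):
    neg_ce = cross_entropy * (1 - pos_mask)      # zero out losses of positive boxes
    order = np.argsort(-neg_ce)                  # box indices by descending negative loss
    rank = np.argsort(order)                     # rank of every box in that ordering
    num_neg = min(num_matched * neg_pre_positive, len(cross_entropy))
    top_k_neg = (rank < num_neg).astype(np.float32)
    return pos_mask + top_k_neg                  # boxes that contribute to the class loss

ce = np.array([0.2, 2.0, 0.1, 1.5, 0.05])
pos = np.array([1.0, 0.0, 0.0, 0.0, 0.0])
print(hard_negative_mask(ce, pos, num_matched=1))   # [1. 1. 1. 1. 0.]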
Example #12
    def _attn(self, query, key, value, attention_mask):
        """
        Get the weighted score along the seq_length
        Inputs:
            query: the query matrix
            key: the key matrix
            value: the value matrix
            attention_mask: the attention mask matrix with shape (batch_size, 1, seq_length, seq_length)
        Returns:
            weighted_values: Tensor, the weighted sum scores
        """
        if not self.scale:
            query = query / F.cast(self.coeff, F.dtype(query))
            key = key / F.cast(self.coeff, F.dtype(key))

        score = self.batch_matmul(query, key)
        if self.scale:
            score = self.real_div(
                score,
                P.Cast()(self.scale_factor, P.DType()(score)))

        ori_dtype = P.DType()(score)
        score = P.Cast()(score, mstype.float32)
        multiply_out = self.sub(
            P.Cast()(F.tuple_to_array((1.0,)), P.DType()(score)),
            P.Cast()(attention_mask, P.DType()(score)))

        adder = self.mul(multiply_out, self.multiply_data)
        attention_scores = self.add(adder, score)

        shape = F.shape(attention_scores)
        attention_probs = self.softmax(
            F.reshape(attention_scores,
                      (shape[0], -1, shape[-1])))
        attention_probs = P.Cast()(attention_probs, ori_dtype)
        attention_probs = F.reshape(attention_probs, shape)

        attention_probs = self.prob_dropout(attention_probs)
        weighted_values = self.batch_matmul(attention_probs, value)
        return weighted_values
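The mask handling above converts a 0/1 attention mask into a large negative bias (via multiply_data) that is added to the scores before the softmax, so masked positions receive near-zero probability. A compact NumPy sketch of the pattern (illustrative; -1e4 stands in for the value of multiply_data, which is not shown):

import numpy as np

def masked_attention(query, key, value, attention_mask, neg_bias=-1e4):
    score = query @ key.transpose(0, 2, 1) / np.sqrt(query.shape[-1])
    score = score + (1.0 - attention_mask) * neg_bias       # push masked positions toward -inf
    score = score - score.max(axis=-1, keepdims=True)       # numerical stability
    probs = np.exp(score)
    probs = probs / probs.sum(axis=-1, keepdims=True)
    return probs @ value

q = np.random.rand(1, 4, 8); k = np.random.rand(1, 4, 8); v = np.random.rand(1, 4, 8)
causal_mask = np.tril(np.ones((1, 4, 4)))
print(masked_attention(q, k, v, causal_mask).shape)         # (1, 4, 8)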
Example #13
def _tensors_cast_datatype_with_sparse(datatype, grad):
    """
    Cast gradient to datatype.

    Args:
        datatype (mstype): the destination datatype of gradient.
        grad (RowTensor): The gradient before operation.

    Returns:
        RowTensor, the gradient after operation.
    """
    dout = F.cast(grad.values, datatype)
    return RowTensor(grad.indices, dout, grad.dense_shape)
Example #14
    def construct(self, x, hx):
        """construct"""
        x = F.cast(x, mstype.float16)
        if self.batch_first:
            x = self.transpose(x, (1, 0, 2))
        # stacked LSTM layers
        h, c = hx
        hn = cn = None
        for i in range(self.num_layers):
            if self.bidirectional:
                x, (hn, cn) = self.lstm(x, h[i], c[i], self.weight_fw[i],
                                        self.bias_fw[i], self.weight_bw[i],
                                        self.bias_bw[i])
            else:
                x, (hn, cn) = self.lstm(x, h[i], c[i], self.weight_fw[i],
                                        self.bias_fw[i])
        if self.batch_first:
            x = self.transpose(x, (1, 0, 2))
        x = F.cast(x, mstype.float32)
        hn = F.cast(hn, mstype.float32)
        cn = F.cast(cn, mstype.float32)
        return x, (hn, cn)
Example #15
    def construct(self, output_hm, output_wh, output_off, output_kps, hm,
                  reg_mask, ind, wh, wight_mask, hm_offset, hps_mask,
                  landmarks):
        """
        Construct method.
        """
        hm_loss = self.cls_loss(output_hm, hm)  # 1. focal loss, center points
        wh_loss = self.reg_loss(output_wh, ind, wh,
                                wight_mask)  # 2. width and height
        off_loss = self.reg_loss(output_off, ind, hm_offset,
                                 wight_mask)  # 3. offset
        lm_loss = self.reg_loss_cmask(output_kps, hps_mask, ind,
                                      landmarks)  # 4. landmark loss

        loss = self.hm_weight * hm_loss + self.wh_weight * wh_loss + \
               self.off_weight * off_loss + self.lm_weight * lm_loss

        # depend is needed when wight_mask and reg_mask are not otherwise used
        F.depend(loss, F.sqrt(F.cast(wight_mask, mstype.float32)))
        F.depend(loss, F.sqrt(F.cast(reg_mask, mstype.float32)))
        # add print when you want to see loss detail and do debug
        return loss
Example #16
def _tensors_cast_datatype_with_sparse(datatype, grad):
    """
    Cast gradient to datatype.

    Args:
        datatype (mstype): the destination datatype of gradient.
        grad (Tuple): The gradient tensor before operation.

    Returns:
        Tuple, the gradient tuple after operation.
    """
    dout = F.cast(grad[1], datatype)
    return (grad[0], dout, grad[2])
Example #17
    def construct(self, data, coord_mask, conf_pos_mask, conf_neg_mask, cls_mask, t_coord, t_conf, t_cls, gt_list,
                  coord_mask_1, conf_pos_mask_1, conf_neg_mask_1, cls_mask_1, t_coord_1, t_conf_1, t_cls_1, gt_list_1,
                  coord_mask_2, conf_pos_mask_2, conf_neg_mask_2, cls_mask_2, t_coord_2, t_conf_2, t_cls_2, gt_list_2,
                  sens=None):
        '''construct'''

        weights = self.weights
        loss = self.network(data, coord_mask, conf_pos_mask, conf_neg_mask, cls_mask, t_coord, t_conf, t_cls, gt_list,
                            coord_mask_1, conf_pos_mask_1, conf_neg_mask_1, cls_mask_1, t_coord_1, t_conf_1, t_cls_1,
                            gt_list_1, coord_mask_2, conf_pos_mask_2, conf_neg_mask_2, cls_mask_2, t_coord_2, t_conf_2,
                            t_cls_2, gt_list_2)
        # init overflow buffer
        init = self.alloc_status()
        # clear overflow buffer
        init = F.depend(init, loss)
        clear_status = self.clear_status(init)

        if sens is None:
            scaling_sens = self.loss_scale
        else:
            scaling_sens = sens
        scaling_sens = F.depend(scaling_sens, clear_status)

        grads = self.grad(self.network, weights)(data, coord_mask, conf_pos_mask, conf_neg_mask, cls_mask, t_coord,
                                                 t_conf, t_cls, gt_list, coord_mask_1, conf_pos_mask_1, conf_neg_mask_1,
                                                 cls_mask_1, t_coord_1, t_conf_1, t_cls_1, gt_list_1, coord_mask_2,
                                                 conf_pos_mask_2, conf_neg_mask_2, cls_mask_2, t_coord_2, t_conf_2,
                                                 t_cls_2, gt_list_2, F.cast(scaling_sens, F.dtype(loss)))

        grads = self.hyper_map(F.partial(_grad_scale, scaling_sens), grads)
        if self.reducer_flag:
            grads = self.grad_reducer(grads)

        # get the overflow buffer
        init = F.depend(init, grads)
        get_status = self.get_status(init)
        init = F.depend(init, get_status)

        # sum overflow buffer elements, 0: no overflow, >0: overflow
        flag_sum = self.reduce_sum(init, (0,))
        if self.is_distributed:
            # sum overflow flag over devices
            flag_reduce = self.allreduce(flag_sum)
            cond = self.less_equal(self.base, flag_reduce)
        else:
            cond = self.less_equal(self.base, flag_sum)

        opt = self.optimizer(grads)

        ret = (loss, cond, scaling_sens)
        return F.depend(ret, opt)
Example #18
    def construct(self, x):
        # VGG16 backbone: block1~5
        block4, x = self.backbone(x)

        # SSD blocks: block6~7
        x = self.b6_1(x)  # 1024
        x = self.b6_2(x)

        x = self.b7_1(x)  # 1024
        x = self.b7_2(x)
        block7 = x

        # Extra Feature Layers: block8~11
        x = self.b8_1(x)  # 256
        x = self.b8_2(x)  # 512
        block8 = x

        x = self.b9_1(x)  # 128
        x = self.b9_2(x)  # 256
        block9 = x

        x = self.b10_1(x)  # 128
        x = self.b10_2(x)  # 256
        block10 = x

        x = self.b11_1(x)  # 128
        x = self.b11_2(x)  # 256
        block11 = x

        # boxes
        multi_feature = (block4, block7, block8, block9, block10, block11)
        pred_loc, pred_label = self.multi_box(multi_feature)
        if not self.training:
            pred_label = self.activation(pred_label)
        pred_loc = F.cast(pred_loc, mstype.float32)
        pred_label = F.cast(pred_label, mstype.float32)
        return pred_loc, pred_label
Example #19
    def construct(self, input_ids, input_position=None, attention_mask=None):

        tokens = self.slice(input_ids, (0, 0), (self.batch_size, -1), (1, 1))

        input_position = self.slice(input_position, (0, 0), (self.batch_size, self.len), (1, 1))
        attention_mask = self.slice_mask(attention_mask, (0, 0, 0),
                                         (self.batch_size, self.len, self.len),
                                         (1, 1, 1))

        input_mask = F.cast(self.not_equal(tokens, self.eos_token),
                            mstype.float32)
        logits = self.network(tokens, input_mask, input_position, attention_mask)
        labels = self.slice(input_ids, (0, 1), (self.batch_size, self.len + 1),
                            (1, 1))
        output = self.loss(logits, labels, input_mask)
        return output
Example #20
    def construct(self, input_ids, input_position, attention_mask):
        #tokens = input_ids[:, :-1]
        ret = None
        for i in range(self.micro_batch_step):
            micro_input, micro_input_position, micro_attention_mask = self.micro_input[i](input_ids, i, input_position, attention_mask)
            tokens = self.slice(micro_input, (0, 0), (self.batch_size // self.micro_batch_step, -1), (1, 1))

            input_mask = F.cast(self.not_equal(tokens, self.eos_token), mstype.float32)
            logits = self.network(tokens, input_mask, micro_input_position, micro_attention_mask)
            labels = self.slice(micro_input, (0, 1), (self.batch_size // self.micro_batch_step,
                                                      self.len + 1), (1, 1))
            output = self.loss(logits, labels, input_mask)
            if ret is not None:
                ret = ret + output
            else:
                ret = output
        return ret
Example #21
    def construct(self, positions, forces, energy):
        outputs = self._network(positions)
        foutputs = -1 * self.grad_op(self._network)(positions)
        if self.add_cast_fp32:
            forces = F.mixed_precision_cast(ms.float32, forces)
            energy = F.mixed_precision_cast(ms.float32, energy)
            outputs = F.cast(outputs, ms.float32)

        if self._energy_fn is None:
            eloss = 0
        else:
            eloss = self._energy_fn(outputs, energy)

        if self._force_fn is None:
            floss = 0
        else:
            floss = self._force_fn(foutputs, forces)
        
        return eloss, floss, outputs, energy, foutputs, forces
Example #22
    def construct(self, x, h, c, w_f, b_f, w_b=None, b_b=None):
        """construct"""
        x = F.cast(x, mstype.float16)
        if self.bidirectional:
            y1, h1, c1, _, _, _, _, _ = self.dynamic_rnn(
                x, w_f, b_f, None, h[0], c[0])
            r_x = self.reverseV2(x)
            y2, h2, c2, _, _, _, _, _ = self.dynamic_rnn(
                r_x, w_b, b_b, None, h[1], c[1])
            y2 = self.reverseV2(y2)

            output = self.concat((y1, y2))
            hn = self.concat((h1, h2))
            cn = self.concat((c1, c2))
            return output, (hn, cn)

        y1, h1, c1, _, _, _, _, _ = self.dynamic_rnn(x, w_f, b_f, None, h[0],
                                                     c[0])
        return y1, (h1, c1)
Example #23
    def construct(self, input_ids, input_mask, table, input_position, attention_mask, layer_past=None):
        """PANGUALPHA model"""
        if not self.use_past:
            layer_past = self.past

        hidden_states = self.pangu_alpha_embedding(input_ids, table, input_position)
        attention_mask = self.pangu_alpha_mask(input_mask, attention_mask)

        present_layer = ()
        for i in range(self.num_layers-1):
            hidden_states, present = self.blocks[i](hidden_states,
                                                    attention_mask, layer_past)
            present_layer = present_layer + (present,)
        top_query_hidden_states = self.top_query_embedding(input_position)
        hidden_states, present = self.blocks[self.num_layers-1](hidden_states, top_query_hidden_states,
                                                                attention_mask, layer_past)
        present_layer = present_layer + (present,)   
        output_state = self.layernorm(hidden_states)
        output_state = F.cast(output_state, self.dtype) 
        return output_state, present_layer
Example #24
def _tensors_allreduce_post(degree, mean, allreduce_filter, grad):
    """
    Apply allreduce on gradient in PyNative mode.

    Args:
        degree (int): The mean coefficient.
        mean (bool): When mean is true, the mean coefficient (degree) is applied to the gradient.
        allreduce_filter (bool): When it is true, allreduce is applied.
        grad (Tensor): The gradient tensor before operation.

    Returns:
        Tensor, the gradient tensor after operation.
    """
    if allreduce_filter:
        if mean:
            grad = F.tensor_mul(grad, F.cast(degree, F.dtype(grad)))
        return grad
    return grad
Example #25
    def __init__(self, tot_atoms):
        super().__init__()
        # tot_atoms: A
        # tot_neigh: N =  A - 1
        tot_neigh = tot_atoms - 1
        arange = nn.Range(tot_atoms)
        nrange = nn.Range(tot_neigh)

        self.ones = P.Ones()
        self.aones = self.ones((tot_atoms), ms.int32)
        self.nones = self.ones((tot_neigh), ms.int32)

        # neighbors for no connection (A*N)
        # [[0,0,...,0],
        #  [1,1,...,1],
        #  ...........,
        #  [N,N,...,N]]
        self.nnc = F.expand_dims(arange(), -1) * self.nones
        # copy of the index range (A*N)
        # [[0,1,...,N-1],
        #  [0,1,...,N-1],
        #  ...........,
        #  [0,1,...,N-1]]
        crange = self.ones((tot_atoms, 1), ms.int32) * nrange()
        # neighbors for full connection (A*N)
        # [[1,2,3,...,N],
        #  [0,2,3,...,N],
        #  [0,1,3,....N],
        #  .............,
        #  [0,1,2,...,N-1]]
        self.nfc = crange + F.cast(self.nnc <= crange, ms.int32)

        crange1 = crange + 1
        # the matrix for index range (A*N)
        # [[1,2,3,...,N],
        #  [1,2,3,...,N],
        #  [2,2,3,....N],
        #  [3,3,3,....N],
        #  .............,
        #  [N,N,N,...,N]]
        self.mat_idx = F.select(crange1 > self.nnc, crange1, self.nnc)
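The comparison trick F.cast(self.nnc <= crange, ...) builds, for every atom, the list of all other atom indices (skipping the atom itself). A NumPy sketch that reproduces the nnc/nfc tables for a small system (illustrative only):

import numpy as np

def neighbour_tables(tot_atoms):
    tot_neigh = tot_atoms - 1
    nnc = np.arange(tot_atoms)[:, None] * np.ones(tot_neigh, dtype=np.int64)   # row i repeats i
    crange = np.ones((tot_atoms, 1), dtype=np.int64) * np.arange(tot_neigh)
    nfc = crange + (nnc <= crange).astype(np.int64)   # row i lists every index except i
    return nnc, nfc

print(neighbour_tables(4)[1])   # [[1 2 3] [0 2 3] [0 1 3] [0 1 2]]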
Example #26
    def construct(self, logits, labels):
        logits = self.transpose(logits, (0, 2, 3, 1))
        logits = self.reshape(logits, (-1, self.num))
        labels = F.cast(labels, mstype.int32)
        labels = self.reshape(labels, (-1, ))
        one_hot_labels = self.one_hot(labels)
        losses = self.cross_entropy(logits, one_hot_labels)[0]
        weights = self.cast(self.not_equal(labels, self.ignore_label),
                            mstype.float32) * self.loss_weight
        weighted_losses = self.mul(losses, weights)
        loss = self.reduce_sum(weighted_losses, (0, ))
        zeros = self.fill(mstype.float32, self.shape(weights), 0.0)
        ones = self.fill(mstype.float32, self.shape(weights), 1.0)
        present = self.select(self.equal(weights, zeros), zeros, ones)
        present = self.reduce_sum(present, (0, ))

        zeros = self.fill(mstype.float32, self.shape(present), 0.0)
        min_control = self.fill(mstype.float32, self.shape(present), 1.0)
        present = self.select(self.equal(present, zeros), min_control, present)
        loss = loss / present
        return loss
Example #27
    def construct(self, x1, x2, y):
        F.same_type_shape(x1, x2)
        _check_reduced_shape_valid(F.shape(x1), F.shape(y), (1,), self.cls_name)
        # if target > 0, 1-cosine(x1, x2)
        # else, max(0, cosine(x1, x2)-margin)
        np_eps = const_utils.get_np_eps(F.dtype(x1))
        eps = F.cast(np_eps, F.dtype(x1))
        prod_sum = self.reduce_sum(x1 * x2, (1,))
        square1 = self.reduce_sum(F.square(x1), (1,)) + eps
        square2 = self.reduce_sum(F.square(x2), (1,)) + eps
        denom = F.sqrt(square1 * square2)
        cosine = prod_sum / denom

        pos_value = 1.0 - cosine
        neg_value = self.maximum(cosine - self.margin, 0.0)
        zeros = F.zeros_like(cosine)
        pos_part = F.select(y == 1, pos_value, zeros)
        neg_part = F.select(y == -1, neg_value, zeros)
        output_unreduced = pos_part + neg_part

        return self.get_loss(output_unreduced)
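For y == 1 the loss pulls the pair together (1 - cosine); for y == -1 it pushes the pair apart once the cosine exceeds the margin. A NumPy sketch of the same computation with mean reduction (illustrative; the margin default and reduction are assumptions):

import numpy as np

def cosine_embedding_loss(x1, x2, y, margin=0.0, eps=1e-7):
    prod_sum = np.sum(x1 * x2, axis=1)
    denom = np.sqrt((np.sum(x1 ** 2, axis=1) + eps) * (np.sum(x2 ** 2, axis=1) + eps))
    cosine = prod_sum / denom
    pos = 1.0 - cosine                       # y == 1: similar pairs
    neg = np.maximum(cosine - margin, 0.0)   # y == -1: dissimilar pairs
    loss = np.where(y == 1, pos, 0.0) + np.where(y == -1, neg, 0.0)
    return loss.mean()

x1 = np.array([[1.0, 0.0], [0.0, 1.0]])
x2 = np.array([[1.0, 0.1], [1.0, 0.0]])
print(cosine_embedding_loss(x1, x2, np.array([1, -1])))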
Example #28
    def construct(self, logits, label, input_mask):
        logits = F.cast(logits, mstype.float32)
        _, logit_max = self.max(logits)
        logit_sub = self.sub(logits, logit_max)
        logit_exp = self.exp(logit_sub)
        exp_sum = self.sum(logit_exp, -1)
        exp_sum = P.Reshape()(exp_sum, (F.shape(exp_sum)[0], 1))
        softmax_result = self.div(logit_exp, exp_sum)
        log_softmax_result = self.log(self.add(softmax_result, self.eps_const))
        label = P.Reshape()(label, (-1,))
        one_hot_label = self.onehot(label, self.vocab_size, self.on_value,
                                    self.off_value)
        loss = self.mul(log_softmax_result, one_hot_label)
        loss_unsum = self.neg(loss)
        loss_reduce = self.sum(loss_unsum, -1)
        input_mask = P.Reshape()(input_mask, (-1,))
        numerator = self.sum2(self.mul2(loss_reduce, input_mask))

        denominator = self.add2(
            self.sum2(input_mask),
            P.Cast()(F.tuple_to_array((1e-5,)), mstype.float32))
        loss = self.div2(numerator, denominator)
        return loss
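The loss above is a token-level softmax cross entropy averaged only over positions where input_mask is 1, with a small epsilon in the denominator. A NumPy sketch (illustrative; the epsilon values are assumptions analogous to the constants in the snippet):

import numpy as np

def masked_softmax_ce(logits, labels, input_mask, vocab_size, eps=1e-5):
    logits = logits - logits.max(axis=-1, keepdims=True)
    probs = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)
    one_hot = np.eye(vocab_size)[labels.reshape(-1)]
    per_token = -np.sum(np.log(probs + 1e-8) * one_hot, axis=-1)
    mask = input_mask.reshape(-1)
    return np.sum(per_token * mask) / (np.sum(mask) + eps)   # average over unmasked tokens

logits = np.random.rand(4, 10)
print(masked_softmax_ce(logits, np.array([1, 3, 5, 7]), np.array([1.0, 1.0, 1.0, 0.0]), vocab_size=10))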
Example #29
    def __init__(self, tot_atoms):
        super().__init__()
        # tot_atoms: A
        # tot_neigh: N =  A - 1
        tot_neigh = tot_atoms - 1
        arange = nn.Range(tot_atoms)
        nrange = nn.Range(tot_neigh)

        self.ones = P.Ones()
        self.aones = self.ones((tot_atoms), ms.int32)
        self.nones = self.ones((tot_neigh), ms.int32)
        self.eaones = F.expand_dims(self.aones, -1)

        # neighbors for no connection (A*N)
        # [[0,0,...,0],
        #  [1,1,...,1],
        #  ...........,
        #  [N,N,...,N]]
        self.nnc = F.expand_dims(arange(), -1) * self.nones

        # copy of the index range (A*N)
        # [[0,1,...,N-1],
        #  [0,1,...,N-1],
        #  ...........,
        #  [0,1,...,N-1]]
        exrange = self.ones((tot_atoms, 1), ms.int32) * nrange()

        # neighbors for full connection (A*N)
        # [[1,2,3,...,N],
        #  [0,2,3,...,N],
        #  [0,1,3,....N],
        #  .............,
        #  [0,1,2,...,N-1]]
        self.nfc = exrange + F.cast(self.nnc <= exrange, ms.int32)

        self.ar0 = nn.Range(0, tot_neigh)()
        self.ar1 = nn.Range(1, tot_atoms)()
Example #30
def _tensors_allreduce_with_sparse(degree, mean, allgather, allreduce,
                                   allreduce_filter, grad):
    """
    Apply allgather on gradient instead of allreduce for sparse feature.
    Allgather is a communication operation used for distributed deep learning.

    Args:
        degree (int): The mean coefficient.
        mean (bool): When mean is true, the mean coefficient (degree) is applied to the gradient.
        allgather (Primitive): The communication operator for sparse gradients.
        allreduce (Primitive): The communication operator for gradients.
        allreduce_filter (bool): When it is true, allgather is applied.
        grad (RowTensor): The sparse gradient (indices, values, dense_shape) before operation.

    Returns:
        RowTensor, the gradient after operation.
    """
    if allreduce_filter:
        indices = allgather(grad.indices)
        dout = allgather(grad.values)
        if mean:
            dout = F.tensor_mul(dout, F.cast(degree, F.dtype(dout)))
        grad = RowTensor(indices, dout, grad.dense_shape)
    return grad