def reduce_max_ad_optimized(head, data, axis, keepdims):
    def get_shape(pld):
        return [d.value for d in pld.shape]

    def custom_reduce_max_fdiff(out, inputs, grad, ad_attrs, new_pld_array):
        data = inputs[0]
        shape = get_shape(data)
        # reduce along the given axis, then broadcast back to the input shape
        max_ = akg.lang.cce.reduce_max(data, axis=axis, keepdims=keepdims)
        max_broadcast = akg.lang.cce.broadcast(max_, shape)
        # route the incoming gradient only to positions holding the maximum
        return [akg.tvm.compute(shape,
                                lambda *indices: akg.tvm.expr.Select(
                                    data(*indices) == max_broadcast(*indices),
                                    grad(*get_reduced_indices(*indices, axis=axis, keepdims=keepdims)),
                                    akg.tvm.const(0, dtype=data.dtype)),
                                name="reduce_max_ad2")]

    l = reduce_max.reduce_max(data, axis, keepdims)
    [dl_ddata] = akg.differentiate(l, [data], head, None, None,
                                   override={l: ([data], custom_reduce_max_fdiff)})
    return dl_ddata
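# A minimal NumPy sketch of the selection rule implemented by custom_reduce_max_fdiff
# above (illustrative only, not the AKG kernel). It assumes numpy inputs and a single
# integer reduction axis; ties receive the gradient at every maximal position, matching
# the Select expression above.
def reduce_max_grad_reference(head, data, axis, keepdims):
    import numpy as np
    max_ = np.max(data, axis=axis, keepdims=True)
    grad = head if keepdims else np.expand_dims(head, axis)
    # gradient flows only to elements equal to the reduced maximum
    return np.where(data == max_, grad, 0).astype(data.dtype)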
def softmax_cross_entropy_with_logits(labels, logits, axis, reduction="mean", scale=1.0):
    max_logits = reduce_max(logits, axis, keepdims=True, target=utils.CCE)
    data_sub = sub(logits, max_logits, target=utils.CCE)
    akg.register_variables("minus_max", [logits], data_sub)
    data_exp = Exp(data_sub, target=utils.CCE)
    data_expsum = sum(data_exp, axis, keepdims=True, target=utils.CCE)
    data_expsum_log = log(data_expsum, target=utils.CCE)
    sub_value = sub(data_sub, data_expsum_log, target=utils.CCE)
    neg_labels = neg(labels, target=utils.CCE)
    cross_entropy = mul(neg_labels, sub_value, target=utils.CCE)
    # backprop: prob - labels, where prob = softmax(logits)
    prob = Exp(sub_value, target=utils.CCE)
    backprop = sub(prob, labels, target=utils.CCE)

    if reduction.lower() == "none":
        loss = sum_v2(cross_entropy, axis, keepdims=True)
    elif reduction.lower() == "mean":
        loss = sum_v2(cross_entropy, axis=None)
        factor = logits.shape[0].value
        loss = loss * akg.tvm.const(1 / factor, logits.dtype)
        backprop = backprop * akg.tvm.const(1 / factor, logits.dtype)
    elif reduction.lower() == "sum":
        loss = sum_v2(cross_entropy, axis=None)
    else:
        raise ValueError("reduction method {0} is not supported".format(reduction))

    backprop = akg.topi.multiply(backprop, akg.tvm.const(scale, backprop.dtype))
    return loss, backprop
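# For reference, the fused loss/backprop pair above is numerically equivalent to the
# NumPy sketch below (illustrative only; shown for reduction="mean", which mirrors the
# factor = logits.shape[0] scaling in the kernel).
def softmax_cross_entropy_reference(labels, logits, axis=-1, scale=1.0):
    import numpy as np
    shifted = logits - np.max(logits, axis=axis, keepdims=True)                # minus_max
    log_prob = shifted - np.log(np.sum(np.exp(shifted), axis=axis, keepdims=True))
    prob = np.exp(log_prob)                                                    # softmax(logits)
    factor = logits.shape[0]
    loss = np.sum(-labels * log_prob) / factor
    backprop = (prob - labels) / factor * scale
    return loss, backprop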
def segment_max(data, segment_ids, num_segments):
    """
    Computes the max value along segment_ids of an akg.tvm.Tensor.

    Args:
        data: akg.tvm.Tensor of type float16, float32.
        segment_ids: akg.tvm.Tensor of type int32, sorted.
        num_segments: the number of classes in segment_ids.

    Returns:
        akg.tvm.Tensor with the same type as data.
    """
    d_dtype = data.dtype
    vc_util.ops_dtype_check(d_dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    d_shape = [x.value for x in data.shape]
    vc_util.check_shape(d_shape)
    s_shape = segment_ids.shape
    vc_util.check_shape(s_shape)

    new_segment_ids, idx = gen_ids(segment_ids)

    output_shape = (1, ) + tuple(d_shape[len(s_shape):])
    zero_data = akg.tvm.compute(output_shape, lambda *i: akg.tvm.const(0.0, d_dtype), name="zero")

    data_list = split.split(data, new_segment_ids)
    out_n = num_segments

    out = []
    j = 0
    for i in range(0, out_n):
        if i in idx:
            tmp = reduce_max.reduce_max(data_list[j], 0, True)
            out.append(tmp)
            j = j + 1
        else:
            out.append(zero_data)

    res = concat.concat(out, 0)
    return res
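# A NumPy sketch of the sorted-segment semantics above (illustrative only; assumes
# numpy inputs and a 1-D, sorted segment_ids). It splits the rows into contiguous runs
# of equal ids and reduces each run with max, mirroring the gen_ids/split/reduce_max flow.
def segment_max_reference(data, segment_ids, num_segments):
    import numpy as np
    out = np.zeros((num_segments,) + data.shape[1:], dtype=data.dtype)
    ids, starts = np.unique(segment_ids, return_index=True)  # runs are contiguous because ids are sorted
    bounds = np.append(starts, len(segment_ids))
    for seg, begin, end in zip(ids, bounds[:-1], bounds[1:]):
        out[seg] = data[begin:end].max(axis=0)
    return out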
def unsorted_segment_max(data, segment_ids, num_segments):
    """
    Computes the max value along segment_ids of an akg.tvm.Tensor.

    Args:
        data: akg.tvm.Tensor of type float16, float32.
        segment_ids: akg.tvm.Tensor of type int32, whose shape is a prefix of data.shape.
        num_segments: the number of classes in segment_ids.

    Returns:
        akg.tvm.Tensor with the same type as data.
    """
    d_dtype = data.dtype
    vc_util.ops_dtype_check(d_dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    d_shape = [x.value for x in data.shape]
    vc_util.check_shape(d_shape)
    s_shape = segment_ids.shape
    vc_util.check_shape(s_shape)

    new_segment_ids, idx = gen_ids(segment_ids)

    output_shape = (1, ) + tuple(d_shape[len(s_shape):])
    zero_data = akg.tvm.compute(output_shape, lambda *i: akg.tvm.const(0.0, d_dtype), name="zero")

    data_list, new_idx = split_new(data, new_segment_ids, idx, num_segments)

    out = []
    j = 0
    for i in range(0, num_segments):
        if i in new_idx:
            tmp = reduce_max.reduce_max(data_list[j], 0, True)
            out.append(tmp)
            j = j + 1
        else:
            out.append(zero_data)

    res = concat.concat(out, 0)
    return res
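# A NumPy sketch of the unsorted-segment semantics above (illustrative only; assumes
# numpy inputs and 1-D segment_ids). Ids may appear in any order; segments that never
# occur keep the zero fill, matching the zero_data branch above.
def unsorted_segment_max_reference(data, segment_ids, num_segments):
    import numpy as np
    out = np.zeros((num_segments,) + data.shape[1:], dtype=data.dtype)
    for seg in range(num_segments):
        rows = data[np.asarray(segment_ids) == seg]
        if rows.size > 0:
            out[seg] = rows.max(axis=0)
    return out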
def focal_loss(prediction, target, gamma):
    """
    Calculate loss by focal loss.

    See Source: <a href="https://arxiv.org/abs/1708.02002">Focal Loss for Dense Object Detection;
    Tsung-Yi Lin, Priya Goyal, Ross Girshick, Kaiming He, Piotr Dollár</a>

    This op fuses the activation function (`softmax`) and the loss function (`focalloss`) together.

    .. math::
        p = softmax(x) \\
        FL(p) = -(1-p)^{\\gamma}log(p)

    Args:
        prediction (tvm.tensor.Tensor): The predicted logits for each class,
            type is float32 or float16 and shape is `(batch_size, num_anchors, num_classes)`.
        target (tvm.tensor.Tensor): The one-hot encoded classification targets,
            type is float32, float16 or int32 and shape is `(batch_size, num_anchors, num_classes)`.
        gamma (float): positive float number.

    Returns:
        tvm.tensor.Tensor, has the same type as inputs with shape `(batch_size, num_anchors)`.
    """
    vc_util.check_shape(prediction, length=3, tensor_name="prediction")
    vc_util.check_shape(target, length=3, tensor_name="target")
    vc_util.ops_dtype_check(prediction.dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    vc_util.ops_dtype_check(
        target.dtype,
        [vc_util.DtypeForDavinci.ALL_FLOAT, vc_util.DtypeForDavinci.INT32])
    vc_util.check_greater("gamma", "zero", gamma, 0)

    dim_info, _ = focal_loss_set_dim_func(prediction, target)
    attrs = {"dim": dim_info}

    dtype = prediction.dtype
    if utils.product_is_mini() and dtype == 'float32':
        prediction = akg.topi.cast(prediction, "float16")
        target = akg.topi.cast(target, "float16")

    axis = -1
    shape = get_shape(prediction)
    maxv = reduce_max(prediction, axis=axis, keepdims=True)
    k1 = akg.tvm.reduce_axis((0, shape[-1]), name="k1")
    expsum = akg.tvm.compute(
        shape[:-1],
        lambda *i: akg.tvm.sum(akg.tvm.exp(prediction(*i, k1) - maxv(*i, 0)), axis=k1),
        name="expsum")

    gamma = akg.tvm.const(gamma, prediction.dtype)
    one = akg.tvm.const(1, prediction.dtype)

    def cal_focalloss(*i):
        x = prediction(*i) - maxv(*i[:-1], 0)
        pred = akg.tvm.exp(x - akg.tvm.log(expsum(*i[:-1])))  # softmax(x)
        log_p = x - akg.tvm.log(expsum(*i[:-1]))  # logsoftmax(x)
        neg_pred_pow = akg.tvm.exp(akg.tvm.log(one - pred) * gamma)  # (1 - pred)^gamma
        loss = akg.tvm.const(-1, prediction.dtype) * target(*i) * neg_pred_pow * log_p
        return loss

    loss = akg.tvm.compute(shape, cal_focalloss, name="loss")
    loss = akg.topi.sum(loss, axis=axis)

    if utils.product_is_mini() and dtype == 'float32':
        loss = akg.topi.cast(loss, "float32")

    return loss, attrs
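# A NumPy sketch of the fused computation above (illustrative only; prediction and target
# are numpy arrays of shape (batch_size, num_anchors, num_classes), and the class axis is
# reduced last, as in cal_focalloss).
def focal_loss_reference(prediction, target, gamma):
    import numpy as np
    x = prediction - np.max(prediction, axis=-1, keepdims=True)
    log_p = x - np.log(np.sum(np.exp(x), axis=-1, keepdims=True))  # log softmax
    p = np.exp(log_p)                                              # softmax
    loss = -target * np.power(1.0 - p, gamma) * log_p              # FL per class
    return np.sum(loss, axis=-1)                                   # shape (batch_size, num_anchors)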
def reduce_max_ad_optimized_manual_schedule(input_shape, dtype, axis, keepdims, polyhedral=True, attrs=None):
    def custom_reduce_max_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        data_ = inputs[0]
        shape = data_.shape
        # reduce the maximum value along the given axis
        max_ = akg.lang.cce.reduce_max(data_, axis=axis, keepdims=True)
        # copy the reduced values back to the original shape
        max_broadcast = akg.lang.cce.broadcast(max_, shape)
        # the head must be broadcast to generate correct cce code for the selection operation
        head_broadcast = akg.tvm.compute(
            shape,
            lambda *indices: head_(*get_reduced_indices(*indices, axis=axis, keepdims=keepdims)))
        # zero every value that is not a maximum; what remains is the adjoint of the output
        max_values_and_zeros = akg.tvm.compute(
            shape,
            lambda *indices: akg.tvm.expr.Select(
                data_(*indices) == max_broadcast(*indices),
                head_broadcast(*indices),
                akg.tvm.const(0, dtype='float16')),
            name="reduce_max_ad2")
        # cast the result back to the original dtype
        if dtype != 'float16':
            return [cast(max_values_and_zeros, dtype)]
        return [max_values_and_zeros]

    # tensor for the input data
    data = akg.tvm.placeholder(input_shape, dtype, name="input_data")

    # computation of reduce max
    # not used in the schedule because this is the differentiated op
    l = reduce_max.reduce_max(data, axis, keepdims)

    # adjoint tensor for the differentiation
    head = akg.tvm.placeholder(l.shape, name="head", dtype=l.dtype)

    # cast the input data
    if dtype != 'float16':
        data_cast = cast(data, "float16")
        head_cast = cast(head, "float16")
    else:
        data_cast = data
        head_cast = head

    # override the differentiation computation with the custom function
    [dl_ddata] = akg.differentiate(
        l, [data_cast], head_cast, None, None,
        override={l: ([data_cast], custom_reduce_max_fdiff)})

    # get the tensors produced by the custom function
    if dtype != 'float16':
        max_values_and_zeros = dl_ddata.op.input_tensors[0]
        max_broadcast = max_values_and_zeros.op.input_tensors[1]
        max_ = max_broadcast.op.input_tensors[0]
        head_broadcast = max_values_and_zeros.op.input_tensors[2]
    else:
        max_broadcast = dl_ddata.op.input_tensors[1]
        max_ = max_broadcast.op.input_tensors[0]
        head_broadcast = dl_ddata.op.input_tensors[2]

    # schedule for the differentiation operation
    # inputs: data and head
    s = akg.tvm.create_schedule([dl_ddata.op])

    # cache reads of the inputs
    if dtype != 'float16':
        head_ub = s.cache_read(head, "local.UB", [head_cast])
        data_ub = s.cache_read(data, "local.UB", [data_cast])
    else:
        # no cast operation
        head_ub = s.cache_read(head_cast, "local.UB", [head_broadcast])
        data_ub = s.cache_read(data_cast, "local.UB", [max_, dl_ddata])

    # cache write for the output
    dl_ddata_ub = s.cache_write(dl_ddata, "local.UB")

    # get the tiling attributes
    if attrs is None:
        raise Exception('attrs is None')
    tiling_factors = attrs['tile']
    split_iterators = []
    assert len(tiling_factors) == len(dl_ddata.shape)

    # split the final compute and save the iterators
    for index, factor in enumerate(tiling_factors):
        split_iterators.append(s[dl_ddata].split(dl_ddata.op.axis[index], factor))

    # get the outermost iterator
    iterator1 = split_iterators[0][0]

    # move the cast computations when there is a cast
    if dtype != "float16":
        s[data_cast].compute_at(s[dl_ddata], iterator1)
        s[data_cast].set_scope("local.UB")
        s[head_cast].compute_at(s[dl_ddata], iterator1)
        s[head_cast].set_scope("local.UB")
        s[max_values_and_zeros].compute_at(s[dl_ddata], iterator1)
        s[max_values_and_zeros].set_scope("local.UB")

    # move the cache reads and writes
    s[data_ub].compute_at(s[dl_ddata], iterator1)
    s[head_ub].compute_at(s[dl_ddata], iterator1)
    s[dl_ddata_ub].compute_at(s[dl_ddata], iterator1)

    # move the computation of the differentiation
    s[max_].compute_at(s[dl_ddata], iterator1)
    s[max_].set_scope("local.UB")
    s[max_broadcast].compute_at(s[dl_ddata], iterator1)
    s[max_broadcast].set_scope("local.UB")
    s[head_broadcast].compute_at(s[dl_ddata], iterator1)
    s[head_broadcast].set_scope("local.UB")

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [head, data, dl_ddata], "cce",
                        name="reduce_max_ad_manual_schedule",
                        attrs=attrs, polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "reduce_max_ad_manual_schedule"
        utils.create_cce(kernel_name, './', source_code)
    return mod
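# Hypothetical invocation sketch (never called here; requires an Ascend/CCE toolchain to
# actually build). The 'tile' list must contain one tiling factor per dimension of the
# input, per the assert above; the shape and factors below are placeholders.
def _example_reduce_max_ad_manual_schedule_build():
    return reduce_max_ad_optimized_manual_schedule(
        input_shape=(32, 64),
        dtype="float16",
        axis=1,
        keepdims=True,
        attrs={"tile": [32, 64]})  # one factor per dimension of dl_ddata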
def reduce_max_ad(head, data, axis, keepdims):
    b = reduce_max.reduce_max(data, axis, keepdims)
    _jacs = akg.differentiate(b, [data], head)
    return _jacs[0]
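# Hypothetical usage sketch (tensor-level only, never called here; shapes and names are
# placeholders): build the gradient of a 2-D reduce_max over axis 1 via plain
# akg.differentiate, without the custom override used above.
def _example_reduce_max_ad():
    data = akg.tvm.placeholder((8, 16), "float16", name="data")
    out = reduce_max.reduce_max(data, 1, True)                      # forward op
    head = akg.tvm.placeholder(out.shape, out.dtype, name="head")   # adjoint of the output
    return reduce_max_ad(head, data, axis=1, keepdims=True)         # same shape as data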