def __init__(self, in_channels, out_channels, weight_init='normal', bias_init='zeros', damping=0.03, loss_scale=1, frequency=278, has_bias=True, activation=None): super(Dense_Thor, self).__init__() self.in_channels = check_int_positive(in_channels) self.out_channels = check_int_positive(out_channels) self.has_bias = check_bool(has_bias) self.thor = True if isinstance(weight_init, Tensor): if weight_init.dim() != 2 or weight_init.shape[0] != out_channels or \ weight_init.shape[1] != in_channels: raise ValueError("weight_init shape error") self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]), name="weight") if self.has_bias: if isinstance(bias_init, Tensor): if bias_init.dim() != 1 or bias_init.shape[0] != out_channels: raise ValueError("bias_init shape error") self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias") self.matmul = P.MatMul(transpose_b=True) self.bias_add = P.BiasAdd() self.activation = get_activation(activation) self.activation_flag = self.activation is not None self.matrix_A_inv = Parameter(Tensor(np.zeros([128, 128, 16, 16]).astype(np.float16)), name='matrix_A_inv', requires_grad=False) self.matrix_G_inv = Parameter(Tensor(np.zeros([63, 63, 16, 16]).astype(np.float16)), name="matrix_G_inv", requires_grad=False) self.fake_G = Tensor(np.zeros([63, 63, 16, 16]).astype(np.float16)) self.matmul = P.MatMul(transpose_b=True) self.cube_matmul = P.CusMatMulCube(transpose_a=True) self.matrix_combine = P.CusMatrixCombine() self.cholesky = P.CusCholeskyTrsm() self.shape = P.Shape() self.reshape = P.Reshape() self.transpose = P.Transpose() self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) self.mul = P.Mul() self.cast = P.Cast() self.damping = Tensor(damping) self.loss_scale = Tensor(1 / loss_scale, mstype.float16) self.vector_matmul = P.CusBatchMatMul() self.pad = P.Pad(((0, 24), (0, 24))) self.pad1 = P.Pad(((0, 8), (0, 8))) self.slice = P.Slice() self.gather = P.GatherV2() self.assignadd = P.AssignAdd() self.freq = Tensor(frequency, mstype.int32) self.axis = 0 self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), name="A_inv_max", requires_grad=False) self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), name="G_inv_max", requires_grad=False) self.fused_abs_max1 = P.CusFusedAbsMax1([1000, 1000]) self.fused_abs_max2 = P.CusFusedAbsMax1() self.log = P.Log() self.exp = P.Exp() self.dampingA = Tensor(np.identity(2048), mstype.float32) self.dampingG = Tensor(np.identity(1024), mstype.float32) self.add = P.TensorAdd() self.sqrt = P.Sqrt() self.getG = P.InsertGradientOf(self.save_gradient)
def __init__(self, in_channels, out_channels, kernel_size, stride=1, pad_mode='same', padding=0, dilation=1, group=1, data_format='NCHW', has_bias=False, weight_init='normal', damping=0.03, loss_scale=1, frequency=278, bias_init='zeros'): self.thor = True ksizes = (1, kernel_size, kernel_size, 1) self.hw = kernel_size * kernel_size strides = (1, stride, stride, 1) kernel_size = twice(kernel_size) super(Conv2d_Thor, self).__init__( in_channels, out_channels, kernel_size, stride, pad_mode, padding, dilation, group, data_format, has_bias, weight_init, bias_init, ) self.conv2d = P.Conv2D(out_channel=self.out_channels, kernel_size=self.kernel_size, mode=1, pad_mode=self.pad_mode, pad=self.padding, stride=self.stride, dilation=self.dilation, group=self.group ) self.img2col = P.CusImg2Col(ksizes=ksizes, strides=strides) self.cube_matmul = P.CusMatMulCube(transpose_a=True) self.matrix_combine = P.CusMatrixCombine() self.cholesky = P.CusCholeskyTrsm() self.transpose02314 = P.CusTranspose02314() self.matrix_A_dim = self.in_channels * self.kernel_size[0] * self.kernel_size[1] self.matrix_G_dim = self.out_channels self.matrix_A_device_shape, self.matrix_A_device_dim = caculate_device_shape(self.matrix_A_dim, self.in_channels, True) self.matrix_G_device_shape, self.matrix_G_device_dim = caculate_device_shape(self.matrix_G_dim, self.in_channels, False) self.matrix_A_device_temp_shape = ( self.matrix_A_device_shape[0], self.matrix_A_device_shape[2], self.matrix_A_device_shape[1], self.matrix_A_device_shape[3]) self.matrix_G_device_temp_shape = ( self.matrix_G_device_shape[0], self.matrix_G_device_shape[2], self.matrix_G_device_shape[1], self.matrix_G_device_shape[3]) self.matrix_A_inv = Parameter( Tensor(np.reshape(np.identity(self.matrix_A_device_dim).astype(np.float16), self.matrix_A_device_shape)), name='matrix_A_inv', requires_grad=False) self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), name="A_inv_max", requires_grad=False) self.matrix_G_inv = Parameter( Tensor(np.reshape(np.identity(self.matrix_G_device_dim).astype(np.float16), self.matrix_G_device_shape)), name="matrix_G_inv", requires_grad=False) self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), name="G_inv_max", requires_grad=False) self.fake_G = Tensor( np.reshape(np.identity(self.matrix_G_device_dim).astype(np.float16), self.matrix_G_device_shape)) self.shape = P.Shape() self.reshape = P.Reshape() self.transpose = P.Transpose() self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) self.mul = P.Mul() self.cast = P.Cast() self.damping = Tensor(damping) self.vector_matmul = P.CusBatchMatMul() self.diag_block_dim = 128 self.channels_slice_flag = False if self.in_channels % C0 != 0: self.channels_slice_flag = True self.padA_flag = False if (self.matrix_A_dim // self.diag_block_dim) * self.diag_block_dim != self.matrix_A_dim \ and self.matrix_A_dim > self.diag_block_dim: self.padA_flag = True pad_dim = self.diag_block_dim - self.matrix_A_dim % self.diag_block_dim self.padA = P.Pad(((0, pad_dim), (0, pad_dim))) self.device_shape_pad_flag = False if self.matrix_A_dim != self.matrix_A_device_dim: self.device_shape_pad_flag = True self.device_shape_pad = P.Pad(((0, 0), (0, C0 - self.in_channels), (0, 0), (0, C0 - self.in_channels))) self.slice = P.Slice() self.gather = P.GatherV2() self.freq = Tensor(frequency, mstype.int32) self.loss_scale = Tensor(1 / loss_scale, mstype.float16) self.axis = 0 dampingA_dim = self.matrix_A_dim if (self.matrix_A_dim % self.diag_block_dim) != 0 and self.matrix_A_dim > self.diag_block_dim: dampingA_dim = (self.matrix_A_dim // self.diag_block_dim + 1) * self.diag_block_dim dampingG_dim = self.matrix_G_dim if (self.matrix_G_dim % self.diag_block_dim) != 0 and self.matrix_G_dim > self.diag_block_dim: dampingG_dim = (self.matrix_G_dim // self.diag_block_dim + 1) * self.diag_block_dim self.dampingA = Tensor(np.identity(dampingA_dim), mstype.float32) self.dampingG = Tensor(np.identity(dampingG_dim), mstype.float32) self.fused_abs_max1 = P.CusFusedAbsMax1([self.matrix_A_dim, self.matrix_A_dim]) self.fused_abs_max2 = P.CusFusedAbsMax1() self.log = P.Log() self.exp = P.Exp() self.sqrt = P.Sqrt() self.getG = P.InsertGradientOf(self.save_gradient)
def __init__(self, in_channels, out_channels, weight_init='normal', bias_init='zeros', damping=0.03, loss_scale=1, frequency=100, has_bias=False, activation=None, batch_size=12): super(Dense_Thor, self).__init__() self.in_channels = Validator.check_positive_int(in_channels) self.out_channels = Validator.check_positive_int(out_channels) self.has_bias = Validator.check_bool(has_bias) self.thor = True if isinstance(weight_init, Tensor): if weight_init.dim() != 2 or weight_init.shape()[0] != out_channels or \ weight_init.shape()[1] != in_channels: raise ValueError("weight_init shape error") self.weight = Parameter(initializer(weight_init, [out_channels, in_channels])) if self.has_bias: if isinstance(bias_init, Tensor): if bias_init.dim() != 1 or bias_init.shape()[0] != out_channels: raise ValueError("bias_init shape error") self.bias = Parameter(initializer(bias_init, [out_channels])) self.matmul = P.MatMul(transpose_b=True) self.bias_add = P.BiasAdd() self.activation = get_activation(activation) self.activation_flag = self.activation is not None self.matrix_A_inv = Parameter(Tensor(np.zeros([in_channels, in_channels]).astype(np.float16)), requires_grad=False) self.matrix_G_inv = Parameter(Tensor(np.zeros([out_channels, out_channels]).astype(np.float16)), requires_grad=False) self.fake_G = Tensor(np.zeros([out_channels, out_channels]).astype(np.float16)) self.matmul = P.MatMul(transpose_b=True) self.cube_matmul = P.CusMatMulCube(transpose_a=True) self.matrix_combine = P.CusMatrixCombine() self.cholesky = P.CusCholeskyTrsm() self.shape = P.Shape() self.reshape = P.Reshape() self.transpose = P.Transpose() self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False) self.mul = P.Mul() self.cast = P.Cast() self.damping = damping self.loss_scale = Tensor(1 / loss_scale, mstype.float16) self.vector_matmul = P.CusBatchMatMul() self.gather = P.GatherV2() self.assignadd = P.AssignAdd() self.freq = Tensor(frequency, mstype.int32) self.axis = 0 self.abs = P.Abs() self.reduce_max = P.ReduceMax(keep_dims=False) self.log = P.Log() self.exp = P.Exp() self.dampingA = Tensor(np.identity(in_channels), mstype.float32) self.dampingG = Tensor(np.identity(out_channels), mstype.float32) self.sqrt = P.Sqrt() self.getG = P.InsertGradientOf(self.save_gradient) self.batch_size = batch_size
def __init__( self, vocab_size, embedding_size, embedding_shape, use_one_hot_embeddings=False, initializer_range=0.02, name='embedding_table', batch_size=12, damping=0.03, loss_scale=1, frequency=100, ): super(Embedding_Thor, self).__init__() self.vocab_size = vocab_size self.use_one_hot_embeddings = use_one_hot_embeddings self.embedding_table = Parameter(initializer( TruncatedNormal(initializer_range), [vocab_size, embedding_size]), name=name) self.thor = True self.expand = P.ExpandDims() self.shape_flat = (-1, ) self.gather = P.GatherV2() self.one_hot = P.OneHot() self.on_value = Tensor(1.0, mstype.float32) self.off_value = Tensor(0.0, mstype.float32) self.array_mul = P.MatMul() self.reshape = P.Reshape() self.em_shape = tuple(embedding_shape) self.shape = P.Shape() self.loss_scale = Tensor(1 / loss_scale, mstype.float16) self.matrix_A_inv = Parameter(Tensor( np.zeros([vocab_size]).astype(np.float16)), name='matrix_A_inv', requires_grad=False) self.matrix_G_inv = Parameter(Tensor( np.zeros([embedding_size, embedding_size]).astype(np.float16)), name="matrix_G_inv", requires_grad=False) self.fake_G = Tensor( np.zeros([embedding_size, embedding_size]).astype(np.float16)) self.dampingA = Tensor(np.ones([vocab_size]).astype(np.float32)) self.dampingG = Tensor(np.identity(embedding_size), mstype.float32) self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) self.freq = Tensor(frequency, mstype.int32) self.axis = 0 self.damping = damping self.gather = P.GatherV2() self.sqrt = P.Sqrt() self.mul = P.Mul() self.cast = P.Cast() self.cube_matmul = P.CusMatMulCube(transpose_a=True) self.vector_matmul = P.CusBatchMatMul() self.cholesky = P.CusCholeskyTrsm() self.matrix_combine = P.CusMatrixCombine() self.reduce_sum = P.ReduceSum(keep_dims=False) self.inv = P.Inv() self.getG = P.InsertGradientOf(self.save_gradient) self.batch_size = batch_size
def __init__(self, in_channels, out_channels, weight_init='normal', bias_init='zeros', has_bias=True, activation=None): super(Dense_Thor, self).__init__() self.thor = True self.in_channels = Validator.check_positive_int(in_channels) self.out_channels = Validator.check_positive_int(out_channels) self.has_bias = Validator.check_bool(has_bias) if isinstance(weight_init, Tensor): if weight_init.dim() != 2 or weight_init.shape[0] != out_channels or \ weight_init.shape[1] != in_channels: raise ValueError("Weight init shape error.") self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]), name="weight") self.bias = None if self.has_bias: if isinstance(bias_init, Tensor): if bias_init.dim() != 1 or bias_init.shape[0] != out_channels: raise ValueError("Bias init shape error.") self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias") self.bias_add = P.BiasAdd() self.matmul = P.MatMul(transpose_b=True) self.activation = get_activation(activation) self.activation_flag = self.activation is not None self.matrix_A = Parameter(Tensor( np.zeros([in_channels, in_channels]).astype(np.float32)), name='matrix_A', requires_grad=False) self.shape = P.Shape() self.reshape = P.Reshape() self.transpose = P.Transpose() self.mul = P.Mul() self.is_Ascend = True if context.get_context("device_target") == "Ascend": if out_channels == 1001: self.matrix_G = Parameter(Tensor( np.zeros([1024, 1024]).astype(np.float32)), name='matrix_G', requires_grad=False) self.pad = P.Pad(((0, 23), (0, 23))) self.pad1 = P.Pad(((0, 7), (0, 7))) self.slice = P.Slice() self.add = P.TensorAdd() else: self.matrix_G = Parameter(Tensor( np.eye(out_channels).astype(np.float32)), name="matrix_G", requires_grad=False) self.abs = P.Abs() self.reduce_max = P.ReduceMax(keep_dims=False) self.neg = P.Neg() self.reduce_sum = P.ReduceSum() self.matmul = P.MatMul(transpose_b=True) self.cube_matmul = P.CusMatMulCube(transpose_a=True) self.cast = P.Cast() self.is_nsp_layer = (out_channels == 2) else: self.is_Ascend = False self.matrix_G = Parameter(Tensor( np.eye(out_channels).astype(np.float32)), name="matrix_G", requires_grad=False) self.cube_matmul = P.MatMul(transpose_a=True) self.getG = P.InsertGradientOf(self.save_gradient)
def __init__(self, in_channels, out_channels, kernel_size, stride=1, pad_mode='same', padding=0, dilation=1, group=1, has_bias=False, weight_init='normal', bias_init='zeros'): kernel_size = twice(kernel_size) stride = twice(stride) self._dilation = dilation dilation = twice(dilation) super(Conv2d_Thor, self).__init__(in_channels, out_channels, kernel_size, stride, pad_mode, padding, dilation, group, has_bias, weight_init, bias_init) self.conv2d = P.Conv2D(out_channel=self.out_channels, kernel_size=self.kernel_size, mode=1, pad_mode=self.pad_mode, pad=self.padding, stride=self.stride, dilation=self.dilation, group=self.group) self._init_depthwise_conv2d(weight_init) self.bias_add = P.BiasAdd() self.thor = True self.hw = kernel_size[0] * kernel_size[1] self.matrix_A_dim = self.in_channels * self.kernel_size[ 0] * self.kernel_size[1] self.matrix_G_dim = self.out_channels self.shape = P.Shape() self.reshape = P.Reshape() self.mul = P.Mul() self.cast = P.Cast() self.A_normalizer = Parameter(initializer(0, [1], mstype.float32), name="A_normalizer", requires_grad=False) self.G_normalizer = Parameter(initializer(0, [1], mstype.float32), name="G_normalizer", requires_grad=False) self.is_Ascend = True if context.get_context("device_target") == "Ascend": ksizes = (1, kernel_size[0], kernel_size[1], 1) strides = (1, stride[0], stride[1], 1) self.img2col = P.CusImg2Col(ksizes=ksizes, strides=strides) self.cube_matmul = P.CusMatMulCube(transpose_a=True) self.transpose02314 = P.CusTranspose02314() dampingA_dim = self.matrix_A_dim self.diag_block_dim = 128 if (self.matrix_A_dim % self.diag_block_dim ) != 0 and self.matrix_A_dim > self.diag_block_dim: dampingA_dim = (self.matrix_A_dim // self.diag_block_dim + 1) * self.diag_block_dim dampingG_dim = self.matrix_G_dim if (self.matrix_G_dim % self.diag_block_dim ) != 0 and self.matrix_G_dim > self.diag_block_dim: dampingG_dim = (self.matrix_G_dim // self.diag_block_dim + 1) * self.diag_block_dim self.matrix_A_cov = Parameter(Tensor( np.zeros([dampingA_dim, dampingA_dim]).astype(np.float32)), name='matrix_A', requires_grad=False) self.matrix_G_cov = Parameter(Tensor( np.zeros([dampingG_dim, dampingG_dim]).astype(np.float32)), name='matrix_G', requires_grad=False) self.channels_slice_flag = False self.C0 = 16 if self.in_channels % self.C0 != 0: self.channels_slice_flag = True self.padA_flag = False if (self.matrix_A_dim // self.diag_block_dim) * self.diag_block_dim != self.matrix_A_dim \ and self.matrix_A_dim > self.diag_block_dim: self.padA_flag = True pad_dim = self.diag_block_dim - self.matrix_A_dim % self.diag_block_dim self.padA = P.Pad(((0, pad_dim), (0, pad_dim))) self.slice = P.Slice() else: self.is_Ascend = False self.img2col = P.Im2Col(kernel_size=kernel_size, stride=stride, pad_mode="same") self.matmul = P.MatMul(transpose_b=True) self.reduce_mean = P.ReduceMean(keep_dims=False) self.matrix_A_cov = Parameter(Tensor( np.zeros([self.matrix_A_dim, self.matrix_A_dim]).astype(np.float32)), name='matrix_A', requires_grad=False) self.matrix_G_cov = Parameter(Tensor( np.zeros([self.matrix_G_dim, self.matrix_G_dim]).astype(np.float32)), name='matrix_G', requires_grad=False) self.getG = P.InsertGradientOf(self.save_gradient)