def __init__(self, in_channels=None, return_2d=False):
    super(LayerNorm, self).__init__()
    self.return_2d = return_2d
    self.layer_norm = nn.LayerNorm((in_channels,))
    self.cast = P.Cast()
    self.get_dtype = P.DType()
    self.reshape = P.Reshape()
    self.get_shape = P.Shape()
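# A minimal sketch (assumption, not taken from the source) of the construct method this
# __init__ implies: run LayerNorm in float32 for numerical stability, cast back to the
# original dtype, and optionally flatten the output to 2-D when return_2d is set.
def construct(self, input_tensor):
    original_dtype = self.get_dtype(input_tensor)
    output = self.layer_norm(self.cast(input_tensor, mstype.float32))
    output = self.cast(output, original_dtype)
    if self.return_2d:
        shape = self.get_shape(input_tensor)
        output = self.reshape(output, (-1, shape[-1]))  # hypothetical flattening of leading axes
    return output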
def __init__(self, config: Callable[..., None]) -> None:
    super().__init__()
    self.dense = nn.Dense(
        config.hidden_size,
        config.hidden_size,
        weight_init=TruncatedNormal(config.initializer_range),
    )
    self.transform_act_fn = get_activation(config.hidden_act)
    self.layer_norm = nn.LayerNorm((config.hidden_size,), epsilon=1e-12)
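# A minimal sketch (assumption) of the standard prediction-head transform this __init__
# sets up: dense projection, activation, then LayerNorm.
def construct(self, hidden_states):
    hidden_states = self.dense(hidden_states)
    hidden_states = self.transform_act_fn(hidden_states)
    hidden_states = self.layer_norm(hidden_states)
    return hidden_states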
def __init__(self, in_channels, out_channels, initializer_range=0.02,
             dropout_prob=0.1, compute_type=mstype.float32):
    super(BertOutput, self).__init__()
    self.dense = nn.Dense(in_channels, out_channels,
                          weight_init=TruncatedNormal(initializer_range)).to_float(compute_type)
    self.dropout = nn.Dropout(1 - dropout_prob)
    self.add = P.TensorAdd()
    self.is_gpu = context.get_context('device_target') == "GPU"
    if self.is_gpu:
        self.layernorm = nn.LayerNorm((out_channels,)).to_float(mstype.float32)
        self.compute_type = compute_type
    else:
        self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
    self.cast = P.Cast()
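# A minimal sketch (assumption) of the BertOutput forward pass these attributes support:
# project, apply dropout, add the residual, and normalize. On GPU the LayerNorm runs in
# float32 via to_float, so the result is cast back to compute_type afterwards.
def construct(self, hidden_status, input_tensor):
    output = self.dense(hidden_status)
    output = self.dropout(output)
    output = self.add(input_tensor, output)
    output = self.layernorm(output)
    if self.is_gpu:
        output = self.cast(output, self.compute_type)
    return output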
def __init__(self, embedding_size, embedding_shape, use_relative_positions=False,
             use_token_type=False, token_type_vocab_size=16, use_one_hot_embeddings=False,
             initializer_range=0.02, max_position_embeddings=512, dropout_prob=0.1):
    super(EmbeddingPostprocessor, self).__init__()
    self.use_token_type = use_token_type
    self.token_type_vocab_size = token_type_vocab_size
    self.use_one_hot_embeddings = use_one_hot_embeddings
    self.max_position_embeddings = max_position_embeddings
    # batch_size, damping and loss_scale are not parameters of this __init__; they are
    # presumably module-level THOR hyper-parameters defined elsewhere in the script.
    self.token_type_embedding = Embedding_Thor(
        vocab_size=token_type_vocab_size,
        embedding_size=embedding_size,
        embedding_shape=embedding_shape,
        use_one_hot_embeddings=use_one_hot_embeddings,
        initializer_range=initializer_range,
        name='embedding_table',
        is_expand=False,
        batch_size=batch_size,
        damping=damping,
        loss_scale=loss_scale,
        frequency=1)
    self.shape_flat = (-1,)
    self.one_hot = P.OneHot()
    self.on_value = Tensor(1.0, mstype.float32)
    self.off_value = Tensor(0.1, mstype.float32)
    self.array_mul = P.MatMul()
    self.reshape = P.Reshape()
    self.shape = tuple(embedding_shape)
    self.dropout = nn.Dropout(1 - dropout_prob)
    self.gather = P.GatherV2()
    self.use_relative_positions = use_relative_positions
    self.slice = P.StridedSlice()
    _, seq, width = self.shape
    position_embedding_shape = [1, seq, width]
    self.full_position_embedding = Embedding_Thor(
        vocab_size=max_position_embeddings,
        embedding_size=embedding_size,
        embedding_shape=position_embedding_shape,
        use_one_hot_embeddings=use_one_hot_embeddings,
        initializer_range=initializer_range,
        name='full_position_embeddings',
        is_expand=False,
        batch_size=batch_size,
        damping=damping,
        loss_scale=loss_scale,
        frequency=1)
    self.position_ids = Tensor(np.arange(seq).reshape(-1, seq).astype(np.int32))
    self.layernorm = nn.LayerNorm((embedding_size,))
def __init__(self, dim):
    super().__init__()
    self.layer_norm = nn.LayerNorm((dim,), -1, -1)
    self.xg2q = Dense(dim, dim, has_bias=False)
    self.xg2k = Dense(dim, dim, has_bias=False)
    self.xg2v = Dense(dim, dim, has_bias=False)
    self.mul = P.Mul()
    self.concat = P.Concat(-2)
def __init__(self, 词库总数, 向量维度, 层数, 头数, 丢弃率, 辞数, 最大长度=1024):
    # Parameters (translated): vocabulary size, embedding dimension, number of layers,
    # number of heads, dropout rate, sequence length, maximum length (default 1024).
    super(多层解码, self).__init__()  # 多层解码: multi-layer decoder
    self.N = 层数
    self.embed = 词向量印刻(词库总数, 向量维度)   # token embedding (词向量印刻: word-embedding layer)
    self.embedP = 词向量印刻(最大长度, 向量维度)  # position embedding
    self.decoders = nn.CellList([解码层(向量维度, 头数, 丢弃率) for i in range(层数)])  # 解码层: decoder layer
    self.norm = nn.LayerNorm((向量维度,), epsilon=1e-6)
    a = [i for i in range(辞数)]
    b = np.array(a).reshape(1, 辞数)
    self.po = Tensor(b, mindspore.int32)
    self.dropout = nn.Dropout(1 - 丢弃率)
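# A minimal sketch (assumption) of the forward pass these attributes suggest: token embedding
# plus learned position embedding, dropout, the stacked decoder layers, then a final LayerNorm.
# The decoder-layer call signature (out, mask) is illustrative only.
def construct(self, x, mask):
    out = self.dropout(self.embed(x) + self.embedP(self.po))
    for decoder in self.decoders:
        out = decoder(out, mask)
    return self.norm(out)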
def __init__(self, config, layer_idx):
    super(Block, self).__init__()
    scale = 1 / math.sqrt(2.0 * config.num_layers)
    if config.self_layernorm:
        self.layernorm1 = LayerNorm((config.embedding_size,), config.dp).to_float(mstype.float32)
        self.layernorm2 = LayerNorm((config.embedding_size,), config.dp).to_float(mstype.float32)
    else:
        self.layernorm1 = nn.LayerNorm((config.embedding_size,)).to_float(mstype.float32)
        self.layernorm1.layer_norm.shard(((config.dp, 1, 1), (1,), (1,)))
        self.layernorm2 = nn.LayerNorm((config.embedding_size,)).to_float(mstype.float32)
        self.layernorm2.layer_norm.shard(((config.dp, 1, 1), (1,), (1,)))
    self.attention = Attention(config, scale, layer_idx)
    self.output = Output(config, scale)
    self.post_layernorm_residual = config.post_layernorm_residual
    self.add = P.TensorAdd().shard(((config.dp, 1, 1), (config.dp, 1, 1)))
    self.add_last = P.TensorAdd().shard(((config.dp, 1, 1), (config.dp, 1, 1)))
    self.add_last.recompute(False)
    self.dtype = config.compute_dtype
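# A minimal sketch (assumption) of a pre-/post-LayerNorm residual block wired from the
# attributes above; the attention/output call signatures are assumptions, not from the source.
def construct(self, x, attention_mask, layer_past=None):
    input_x = self.layernorm1(x)
    attention, layer_present = self.attention(input_x, attention_mask, layer_past)
    if self.post_layernorm_residual:
        x = self.add(input_x, attention)
    else:
        x = self.add(x, attention)
    output_x = self.layernorm2(x)
    mlp_logit = self.output(output_x)
    if self.post_layernorm_residual:
        output = self.add_last(output_x, mlp_logit)
    else:
        output = self.add_last(x, mlp_logit)
    return output, layer_present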
def __init__(self, in_channels, out_channels, initializer_range=0.02,
             dropout_prob=0.1, compute_type=mstype.float32):
    super(BertOutput, self).__init__()
    self.dense = nn.Dense(in_channels, out_channels,
                          weight_init=TruncatedNormal(initializer_range)).to_float(compute_type)
    self.dropout = nn.Dropout(1 - dropout_prob)
    self.add = P.TensorAdd()
    # nn.LayerNorm expects the normalized shape as a tuple/list, not a bare int
    self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
    self.cast = P.Cast()
def __init__(self, config):
    super(GPT_Model, self).__init__()
    self.get_attention_mask = AttentionMask(config)
    self.word_embedding = EmbeddingLookup(config)
    self.position_embedding = nn.Embedding(
        config.seq_length,
        config.embedding_size,
        embedding_table=TruncatedNormal(0.02))
    self.blocks = nn.CellList()
    for i in range(config.num_layers):
        self.blocks.append(Block(config, i + 1))
    self.layernorm = nn.LayerNorm((config.embedding_size,)).to_float(config.compute_dtype)
    self.use_past = config.use_past
    self.past = tuple([None] * config.num_layers)
    self.num_layers = config.num_layers
def __init__(self, in_channels, out_channels, initializer_range=0.02, dropout_prob=0.1,
             compute_type=mstype.float32, enable_fused_layernorm=False):
    super(BertOutput, self).__init__()
    self.dense = nn.Dense(in_channels, out_channels,
                          weight_init=TruncatedNormal(initializer_range)).to_float(compute_type)
    self.dropout = nn.Dropout(1 - dropout_prob)
    self.dropout_prob = dropout_prob
    self.add = ops.TensorAdd()
    if compute_type == mstype.float16:
        self.layernorm = FusedLayerNorm((out_channels,),
                                        use_batch_norm=enable_fused_layernorm).to_float(compute_type)
    else:
        self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
    self.cast = ops.Cast()
def __init__(self, config: Callable[..., None]) -> None:
    super().__init__()
    self.token_embeddings = nn.Embedding(
        config.vocab_size,
        config.hidden_size,
        embedding_table=TruncatedNormal(config.initializer_range),
        padding_idx=0,
    )
    self.position_embeddings = nn.Embedding(
        config.max_position_embeddings,
        config.hidden_size,
        embedding_table=TruncatedNormal(config.initializer_range),
    )
    self.token_type_embeddings = nn.Embedding(
        config.type_vocab_size,
        config.hidden_size,
        embedding_table=TruncatedNormal(config.initializer_range),
    )
    self.layer_norm = nn.LayerNorm((config.hidden_size,), epsilon=1e-12)
    self.dropout = nn.Dropout(1.0 - config.hidden_dropout_prob)
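# A minimal sketch (assumption) of the usual BERT-style embedding combination this __init__
# supports: token + position + token-type embeddings, then LayerNorm and dropout. The
# handling of position_ids and token_type_ids here is illustrative only.
def construct(self, input_ids, token_type_ids, position_ids):
    embeddings = self.token_embeddings(input_ids)
    embeddings += self.position_embeddings(position_ids)
    embeddings += self.token_type_embeddings(token_type_ids)
    embeddings = self.layer_norm(embeddings)
    embeddings = self.dropout(embeddings)
    return embeddings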
def __init__(self, config):
    super(PANGUALPHA_ModelPipeline, self).__init__()
    self.pangu_alpha_embedding = PANGUALPHA_EmbeddingPipeLine(config).set_comm_fusion(1)
    self.pangu_alpha_embedding.stage = 0
    self.pangu_alpha_mask = PANGUALPHA_Mask(config)
    self.blocks = nn.CellList()
    dropout_recompute = False
    self.top_query_embedding = nn.Embedding(config.seq_length, config.embedding_size,
                                            embedding_table=TruncatedNormal(0.02))
    self.top_query_embedding.gather.shard(((1, 1), (config.dp,)))
    self.top_query_embedding.expand.shard(((config.dp, 1),))
    for i in range(config.num_layers):
        if i == config.num_layers - 1:
            self.top_query_embedding.set_comm_fusion(2)
            self.top_query_embedding.stage = i * config.stage_num // config.num_layers
            per_block = QueryLayer(config).set_comm_fusion(2)
        else:
            per_block = Block(config, i + 1).set_comm_fusion(2)
        per_block.stage = i * config.stage_num // config.num_layers
        per_block.recompute()
        self.blocks.append(per_block)
        if not dropout_recompute:
            per_block.attention.dropout.dropout_gen_mask.recompute(False).add_prim_attr("_side_effect", True)
            per_block.attention.prob_dropout.dropout_gen_mask.recompute(False).add_prim_attr("_side_effect", True)
            per_block.output.dropout.dropout_gen_mask.recompute(False).add_prim_attr("_side_effect", True)
    if config.self_layernorm:
        self.layernorm = LayerNorm((config.embedding_size,), config.dp).to_float(mstype.float32)
    else:
        self.layernorm = nn.LayerNorm((config.embedding_size,)).to_float(mstype.float32)
        self.layernorm.layer_norm.shard(((config.dp, 1, 1), (1,), (1,)))
    self.layernorm.set_comm_fusion(2)
    #self.layernorm.set_comm_fusion(3)
    self.layernorm.stage = config.stage_num - 1
    self.use_past = config.use_past
    self.past = tuple([None] * config.num_layers)
    self.dtype = config.compute_dtype
    self.num_layers = config.num_layers
def __init__(self, config, is_training, num_tokens, dropout_prob=0.0, use_one_hot_embeddings=False):
    super(BertPoetryModel, self).__init__()
    self.bert = BertModel(config, is_training, use_one_hot_embeddings)
    self.num_tokens = num_tokens
    idx = np.arange(config.seq_length)
    mask = idx[None, :] <= idx[:, None]
    self.mask = Tensor([mask], mstype.float32)
    self.MLM_Dense = nn.Dense(config.hidden_size, config.hidden_size,
                              has_bias=True, weight_init=TruncatedNormal(0.02),
                              activation='gelu').to_float(mstype.float16)
    self.layer_norm = nn.LayerNorm((config.hidden_size,))
    self.matmul = ops.MatMul(transpose_b=True)
    self.biasadd = Parameter(initializer('zero', self.num_tokens), name='MLM_output_biasadd')
    self.softmax = ops.Softmax(axis=-1)
    self.seq_length = config.seq_length
    self.hidden_size = config.hidden_size
    self.cast = ops.Cast()
    self.reshape = ops.Reshape()
    self.batch_matmul = ops.BatchMatMul()
    ones = np.ones(shape=(config.batch_size, config.seq_length, config.seq_length))
    self.lower_triangle_mask = Tensor(np.tril(ones), dtype=mstype.float32)
    self.multiply = ops.Mul()
def __init__(self, use_relative_positions, embedding_size, embedding_shape,
             use_token_type=False, token_type_vocab_size=16, use_one_hot_embeddings=False,
             initializer_range=0.02, max_position_embeddings=512, dropout_prob=0.1):
    super(EmbeddingPostprocessor, self).__init__()
    self.use_token_type = use_token_type
    self.token_type_vocab_size = token_type_vocab_size
    self.use_one_hot_embeddings = use_one_hot_embeddings
    self.max_position_embeddings = max_position_embeddings
    self.token_type_embedding = nn.Embedding(
        vocab_size=token_type_vocab_size,
        embedding_size=embedding_size,
        use_one_hot=use_one_hot_embeddings)
    self.shape_flat = (-1,)
    self.one_hot = P.OneHot()
    self.on_value = Tensor(1.0, mstype.float32)
    self.off_value = Tensor(0.1, mstype.float32)
    self.array_mul = P.MatMul()
    self.reshape = P.Reshape()
    self.shape = tuple(embedding_shape)
    self.dropout = nn.Dropout(1 - dropout_prob)
    self.gather = P.Gather()
    self.use_relative_positions = use_relative_positions
    self.slice = P.StridedSlice()
    _, seq, _ = self.shape
    self.full_position_embedding = nn.Embedding(
        vocab_size=max_position_embeddings,
        embedding_size=embedding_size,
        use_one_hot=False)
    self.layernorm = nn.LayerNorm((embedding_size,))
    self.position_ids = Tensor(np.arange(seq).reshape(-1, seq).astype(np.int32))
    self.add = P.Add()
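# A minimal sketch (assumption) of the post-processing these attributes support: add
# token-type and absolute position embeddings to the word embeddings, then LayerNorm and
# dropout. Relative positions are handled elsewhere when use_relative_positions is True.
def construct(self, token_type_ids, word_embeddings):
    output = word_embeddings
    if self.use_token_type:
        output = self.add(output, self.token_type_embedding(token_type_ids))
    if not self.use_relative_positions:
        position_embeddings = self.full_position_embedding(self.position_ids)
        output = self.add(output, position_embeddings)
    output = self.layernorm(output)
    output = self.dropout(output)
    return output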
def __init__(self, embedding_size, embedding_shape, use_relative_positions=False,
             use_token_type=False, token_type_vocab_size=16, use_one_hot_embeddings=False,
             initializer_range=0.02, max_position_embeddings=512, dropout_prob=0.1):
    super(EmbeddingPostprocessor, self).__init__()
    self.use_token_type = use_token_type
    self.token_type_vocab_size = token_type_vocab_size
    self.use_one_hot_embeddings = use_one_hot_embeddings
    self.max_position_embeddings = max_position_embeddings
    self.embedding_table = Parameter(
        initializer(TruncatedNormal(initializer_range),
                    [token_type_vocab_size, embedding_size]),
        name='embedding_table')
    self.shape_flat = (-1,)
    self.one_hot = P.OneHot()
    self.on_value = Tensor(1.0, mstype.float32)
    self.off_value = Tensor(0.1, mstype.float32)
    self.array_mul = P.MatMul()
    self.reshape = P.Reshape()
    self.shape = tuple(embedding_shape)
    self.layernorm = nn.LayerNorm((embedding_size,))
    self.dropout = nn.Dropout(1 - dropout_prob)
    self.gather = P.GatherV2()
    self.use_relative_positions = use_relative_positions
    self.slice = P.StridedSlice()
    self.full_position_embeddings = Parameter(
        initializer(TruncatedNormal(initializer_range),
                    [max_position_embeddings, embedding_size]),
        name='full_position_embeddings')
def __init__(self, in_channels, out_channels, initializer_range=0.02, dropout_prob=0.1,
             compute_type=mstype.float32, enable_fused_layernorm=False):
    super(BertOutput, self).__init__()
    # damping, loss_scale, frequency and batch_size are not parameters of this __init__;
    # they are presumably module-level THOR hyper-parameters defined elsewhere in the script.
    self.dense = Dense_Thor(in_channels=in_channels,
                            out_channels=out_channels,
                            weight_init=TruncatedNormal(initializer_range),
                            has_bias=True,
                            bias_init='zeros',
                            damping=damping,
                            loss_scale=loss_scale,
                            frequency=frequency,
                            activation=None,
                            batch_size=batch_size).to_float(compute_type)
    self.dropout = nn.Dropout(1 - dropout_prob)
    self.dropout_prob = dropout_prob
    self.add = P.TensorAdd()
    self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
    self.cast = P.Cast()
def __init__(self, in_channels=None):
    super(LayerNorm, self).__init__()
    self.layer_norm = nn.LayerNorm((in_channels,))
    self.cast = P.Cast()
    self.get_dtype = P.DType()
def test_check_layer_norm_1():
    x = Tensor(np.ones([20, 5, 10, 10]), mstype.float32)
    shape1 = x.shape()[1:]
    m = nn.LayerNorm(shape1, -1, 1)
    with pytest.raises(NotImplementedError):
        m(x)
def test_check_layer_norm_3():
    x = Tensor(np.ones([20, 5, 10, 10]), mstype.float32)
    shape1 = (10, 10)
    m = nn.LayerNorm(shape1, begin_params_axis=2)
    with pytest.raises(NotImplementedError):
        m(x)
def __init__(self, hidden_size: int, hidden_dropout_prob: float = 0.1) -> None:
    super().__init__()
    self.layer_norm = nn.LayerNorm((hidden_size,), epsilon=1e-12)
    self.dropout = nn.Dropout(1.0 - hidden_dropout_prob)
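# A minimal sketch (assumption) of the residual pattern these two layers usually serve:
# dropout on the incoming hidden states, then LayerNorm over the residual sum.
def construct(self, hidden_states, input_tensor):
    hidden_states = self.dropout(hidden_states)
    hidden_states = self.layer_norm(hidden_states + input_tensor)
    return hidden_states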
def __init__(self, config):
    super(PANGUALPHA_Model, self).__init__()
    self.get_attention_mask = AttentionMask(config)
    self.word_embedding = EmbeddingLookup(config).set_comm_fusion(1)
    self.eod_reset = config.eod_reset
    if config.load_ckpt_path:
        # Load the position embedding table from the checkpoint path
        embedding_path = os.path.join(config.load_ckpt_path, 'position_embedding.npy')
        if os.path.exists(embedding_path):
            p_table = np.load(embedding_path)
            position_table_param = Tensor(p_table, mstype.float32)
        else:
            raise ValueError(f"{embedding_path} does not exist, please check whether the position_embedding file exists.")
    else:
        position_table_param = TruncatedNormal(0.02)
    self.position_embedding = nn.Embedding(
        config.seq_length,
        config.embedding_size,
        embedding_table=position_table_param).set_comm_fusion(1)
    self.word_embedding.embedding_table.parallel_optimizer = False
    self.position_embedding.embedding_table.parallel_optimizer = False
    self.position_embedding.gather.shard(((1, 1), (config.dp,)))
    self.position_embedding.expand.shard(((config.dp, 1),))
    self.blocks = nn.CellList()
    fusion_group_num = 4
    fusion_group_size = config.num_layers // fusion_group_num
    fusion_group_size = max(fusion_group_size, 1)
    num_layers = config.num_layers - 1
    self.num_layers = num_layers
    for i in range(num_layers):
        per_block = Block(config, i + 1).set_comm_fusion(int(i / fusion_group_size) + 2)
        per_block.recompute()
        per_block.attention.dropout.dropout_gen_mask.recompute(False)
        per_block.attention.prob_dropout.dropout_gen_mask.recompute(False)
        per_block.output.dropout.dropout_gen_mask.recompute(False)
        per_block.attention.dropout.dropout_gen_mask.add_prim_attr("_side_effect", True)
        per_block.attention.prob_dropout.dropout_gen_mask.add_prim_attr("_side_effect", True)
        per_block.output.dropout.dropout_gen_mask.add_prim_attr("_side_effect", True)
        self.blocks.append(per_block)
    if config.self_layernorm:
        self.layernorm = LayerNorm((config.embedding_size,), config.dp).to_float(
            mstype.float32).set_comm_fusion(int((num_layers - 1) / fusion_group_size) + 2)
    else:
        self.layernorm = nn.LayerNorm((config.embedding_size,)).to_float(
            mstype.float32).set_comm_fusion(int((num_layers - 1) / fusion_group_size) + 2)
        self.layernorm.layer_norm.shard(((config.dp, 1, 1), (1,), (1,)))
    self.layernorm.gamma.parallel_optimizer = False
    self.layernorm.beta.parallel_optimizer = False
    self.use_past = config.use_past
    self.past = tuple([None] * config.num_layers)
    self.add = P.TensorAdd().shard(((config.dp, 1, 1), (config.dp, 1, 1)))
    self.expand_dims = P.ExpandDims().shard(((config.dp, 1, 1),))
    self.dtype = config.compute_dtype
    self.dropout = nn.Dropout(1 - config.dropout_rate)
    self.dropout.dropout_gen_mask.shard(((config.dp, 1, 1),))
    self.dropout.dropout_do_mask.shard(((config.dp, 1, 1),))
    if config.load_ckpt_path:
        # Load the top-query embedding table from the checkpoint path
        embedding_path = os.path.join(config.load_ckpt_path, 'top_query_embedding.npy')
        if os.path.exists(embedding_path):
            top_query_table = np.load(embedding_path)
            top_query_table_param = Tensor(top_query_table, mstype.float32)
        else:
            raise ValueError(f"{embedding_path} does not exist, please check whether the top_query_embedding file exists.")
    else:
        top_query_table_param = TruncatedNormal(0.02)
    self.top_query_embedding = nn.Embedding(
        config.seq_length,
        config.embedding_size,
        embedding_table=top_query_table_param).set_comm_fusion(
            int((config.num_layers - 1) / fusion_group_num) + 2)
    self.top_query_embedding.embedding_table.parallel_optimizer = False
    self.top_query_embedding.gather.shard(((1, 1), (config.dp,)))
    self.top_query_embedding.expand.shard(((config.dp, 1),))
    self.top_query_layer = QueryLayer(config)
    self.top_query_layer.recompute()
    self.top_query_layer.output.dropout.dropout_gen_mask.recompute(False)
    self.top_query_layer.attention.dropout.dropout_gen_mask.recompute(False)
    self.top_query_layer.attention.prob_dropout.dropout_gen_mask.recompute(False)
    self.top_query_layer.output.dropout.dropout_gen_mask.add_prim_attr("_side_effect", True)
    self.top_query_layer.attention.dropout.dropout_gen_mask.add_prim_attr("_side_effect", True)
    self.top_query_layer.attention.prob_dropout.dropout_gen_mask.add_prim_attr("_side_effect", True)
    self.top_query_layer.set_comm_fusion(int((config.num_layers - 1) / fusion_group_num) + 2)