Example #1
 def __init__(self, in_channels=None, return_2d=False):
     super(LayerNorm, self).__init__()
     self.return_2d = return_2d
     self.layer_norm = nn.LayerNorm((in_channels, ))
     self.cast = P.Cast()
     self.get_dtype = P.DType()
     self.reshape = P.Reshape()
     self.get_shape = P.Shape()
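The snippet above only registers the primitives; the forward pass is not shown. A hypothetical construct consistent with those attributes (my own sketch, not part of the original example) would normalize in float32, restore the input dtype, and optionally flatten the output to 2D:

 def construct(self, input_tensor):
     # Hypothetical forward pass: normalize in float32, then restore the input dtype.
     original_type = self.get_dtype(input_tensor)
     output = self.cast(input_tensor, mstype.float32)
     output = self.layer_norm(output)
     output = self.cast(output, original_type)
     if self.return_2d:
         # Flatten (batch, seq, hidden) into (batch * seq, hidden) when requested.
         shape = self.get_shape(output)
         output = self.reshape(output, (-1, shape[-1]))
     return output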
Example #2
 def __init__(self, config: Callable[..., None]) -> None:
     super().__init__()
     self.dense = nn.Dense(
         config.hidden_size,
         config.hidden_size,
         weight_init=TruncatedNormal(config.initializer_range),
     )
     self.transform_act_fn = get_activation(config.hidden_act)
     self.layer_norm = nn.LayerNorm((config.hidden_size, ), epsilon=1e-12)
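Only the __init__ is shown; the following is a minimal sketch of the prediction-head transform these attributes suggest (dense projection, activation, layer norm), offered as an assumption rather than the source's actual construct:

 def construct(self, hidden_states):
     # Assumed forward pass: dense projection, activation, then layer normalization.
     hidden_states = self.dense(hidden_states)
     hidden_states = self.transform_act_fn(hidden_states)
     hidden_states = self.layer_norm(hidden_states)
     return hidden_states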
Example #3
    def __init__(self,
                 in_channels,
                 out_channels,
                 initializer_range=0.02,
                 dropout_prob=0.1,
                 compute_type=mstype.float32):
        super(BertOutput, self).__init__()
        self.dense = nn.Dense(in_channels, out_channels,
                              weight_init=TruncatedNormal(initializer_range)).to_float(compute_type)
        self.dropout = nn.Dropout(1 - dropout_prob)
        self.add = P.TensorAdd()
        self.is_gpu = context.get_context('device_target') == "GPU"
        if self.is_gpu:
            self.layernorm = nn.LayerNorm((out_channels,)).to_float(mstype.float32)
            self.compute_type = compute_type
        else:
            self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)

        self.cast = P.Cast()
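Again the construct is omitted. Below is a hedged sketch of the residual pattern these attributes imply (dense, dropout, residual add, layer norm), with the GPU branch casting the float32 layer-norm result back to compute_type:

    def construct(self, hidden_status, input_tensor):
        # Assumed forward pass: project, drop out, add the residual, then layer-normalize.
        output = self.dense(hidden_status)
        output = self.dropout(output)
        output = self.add(input_tensor, output)
        if self.is_gpu:
            # The layer norm was built in float32 on GPU, so cast the result back to compute_type.
            output = self.cast(self.layernorm(output), self.compute_type)
        else:
            output = self.layernorm(output)
        return output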
Example #4
 def __init__(self,
              embedding_size,
              embedding_shape,
              use_relative_positions=False,
              use_token_type=False,
              token_type_vocab_size=16,
              use_one_hot_embeddings=False,
              initializer_range=0.02,
              max_position_embeddings=512,
              dropout_prob=0.1):
     super(EmbeddingPostprocessor, self).__init__()
     self.use_token_type = use_token_type
     self.token_type_vocab_size = token_type_vocab_size
     self.use_one_hot_embeddings = use_one_hot_embeddings
     self.max_position_embeddings = max_position_embeddings
     # Note: batch_size, damping and loss_scale below are not parameters of this __init__;
     # they are presumably module-level values in the original THOR-based BERT script.
     self.token_type_embedding = Embedding_Thor(
         vocab_size=token_type_vocab_size,
         embedding_size=embedding_size,
         embedding_shape=embedding_shape,
         use_one_hot_embeddings=use_one_hot_embeddings,
         initializer_range=initializer_range,
         name='embedding_table',
         is_expand=False,
         batch_size=batch_size,
         damping=damping,
         loss_scale=loss_scale,
         frequency=1)
     self.shape_flat = (-1, )
     self.one_hot = P.OneHot()
     self.on_value = Tensor(1.0, mstype.float32)
     self.off_value = Tensor(0.1, mstype.float32)
     self.array_mul = P.MatMul()
     self.reshape = P.Reshape()
     self.shape = tuple(embedding_shape)
     self.dropout = nn.Dropout(1 - dropout_prob)
     self.gather = P.GatherV2()
     self.use_relative_positions = use_relative_positions
     self.slice = P.StridedSlice()
     _, seq, width = self.shape
     position_embedding_shape = [1, seq, width]
     self.full_position_embedding = Embedding_Thor(
         vocab_size=max_position_embeddings,
         embedding_size=embedding_size,
         embedding_shape=position_embedding_shape,
         use_one_hot_embeddings=use_one_hot_embeddings,
         initializer_range=initializer_range,
         name='full_position_embeddings',
         is_expand=False,
         batch_size=batch_size,
         damping=damping,
         loss_scale=loss_scale,
         frequency=1)
     self.position_ids = Tensor(
         np.arange(seq).reshape(-1, seq).astype(np.int32))
     self.layernorm = nn.LayerNorm((embedding_size, ))
Example #5
    def __init__(self, dim):
        super().__init__()

        self.layer_norm = nn.LayerNorm((dim, ), -1, -1)

        self.xg2q = Dense(dim, dim, has_bias=False)
        self.xg2k = Dense(dim, dim, has_bias=False)
        self.xg2v = Dense(dim, dim, has_bias=False)

        self.mul = P.Mul()
        self.concat = P.Concat(-2)
Example #6
 # Parameter names are Chinese identifiers: 词库总数 = vocabulary size, 向量维度 = embedding dimension,
 # 层数 = number of layers, 头数 = number of attention heads, 丢弃率 = dropout rate,
 # 辞数 = sequence length, 最大长度 = maximum length.
 def __init__(self, 词库总数, 向量维度, 层数, 头数, 丢弃率, 辞数, 最大长度=1024):
     super(多层解码, self).__init__()
     self.N = 层数
     self.embed = 词向量印刻(词库总数, 向量维度)
     self.embedP = 词向量印刻(最大长度, 向量维度)
     self.decoders = nn.CellList([解码层(向量维度, 头数, 丢弃率) for i in range(层数)])
     self.norm = nn.LayerNorm((向量维度,), epsilon=1e-6)
     a = [i for i in range(辞数)]
     b = np.array(a).reshape(1, 辞数)
     self.po = Tensor(b, mindspore.int32)
     self.dropout = nn.Dropout(1 - 丢弃率)
Example #7
 def __init__(self, config, layer_idx):
     super(Block, self).__init__()
     scale = 1 / math.sqrt(2.0 * config.num_layers)
     if config.self_layernorm:
         self.layernorm1 = LayerNorm((config.embedding_size,), config.dp).to_float(mstype.float32)
         self.layernorm2 = LayerNorm((config.embedding_size,), config.dp).to_float(mstype.float32)
     else:
         self.layernorm1 = nn.LayerNorm(
             (config.embedding_size,)).to_float(mstype.float32)
         self.layernorm1.layer_norm.shard(((config.dp, 1, 1), (1,), (1,)))
         self.layernorm2 = nn.LayerNorm(
             (config.embedding_size,)).to_float(mstype.float32)
         self.layernorm2.layer_norm.shard(((config.dp, 1, 1), (1,), (1,)))
     
     self.attention = Attention(config, scale, layer_idx)
     self.output = Output(config, scale)
     self.post_layernorm_residual = config.post_layernorm_residual
     self.add = P.TensorAdd().shard(((config.dp, 1, 1), (config.dp, 1, 1)))
     self.add_last = P.TensorAdd().shard(((config.dp, 1, 1), (config.dp, 1, 1)))
     self.add_last.recompute(False)
     self.dtype = config.compute_dtype
Example #8
 def __init__(self,
              in_channels,
              out_channels,
              initializer_range=0.02,
              dropout_prob=0.1,
              compute_type=mstype.float32):
     super(BertOutput, self).__init__()
     self.dense = nn.Dense(in_channels, out_channels,
                           weight_init=TruncatedNormal(initializer_range)).to_float(compute_type)
     self.dropout = nn.Dropout(1 - dropout_prob)
     self.add = P.TensorAdd()
     self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
     self.cast = P.Cast()
Example #9
 def __init__(self, config):
     super(GPT_Model, self).__init__()
     self.get_attention_mask = AttentionMask(config)
     self.word_embedding = EmbeddingLookup(config)
     self.position_embedding = nn.Embedding(
         config.seq_length,
         config.embedding_size,
         embedding_table=TruncatedNormal(0.02))
     self.blocks = nn.CellList()
     for i in range(config.num_layers):
         self.blocks.append(Block(config, i + 1))
     self.layernorm = nn.LayerNorm(
         (config.embedding_size, )).to_float(config.compute_dtype)
     self.use_past = config.use_past
     self.past = tuple([None] * config.num_layers)
     self.num_layers = config.num_layers
Example #10
 def __init__(self,
              in_channels,
              out_channels,
              initializer_range=0.02,
              dropout_prob=0.1,
              compute_type=mstype.float32,
              enable_fused_layernorm=False):
     super(BertOutput, self).__init__()
     self.dense = nn.Dense(in_channels, out_channels,
                           weight_init=TruncatedNormal(initializer_range)).to_float(compute_type)
     self.dropout = nn.Dropout(1 - dropout_prob)
     self.dropout_prob = dropout_prob
     self.add = ops.TensorAdd()
     if compute_type == mstype.float16:
         self.layernorm = FusedLayerNorm((out_channels,),
                                         use_batch_norm=enable_fused_layernorm).to_float(compute_type)
     else:
         self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
     self.cast = ops.Cast()
Example #11
    def __init__(self, config: Callable[..., None]) -> None:
        super().__init__()
        self.token_embeddings = nn.Embedding(
            config.vocab_size,
            config.hidden_size,
            embedding_table=TruncatedNormal(config.initializer_range),
            padding_idx=0,
        )
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings,
            config.hidden_size,
            embedding_table=TruncatedNormal(config.initializer_range),
        )
        self.token_type_embeddings = nn.Embedding(
            config.type_vocab_size,
            config.hidden_size,
            embedding_table=TruncatedNormal(config.initializer_range),
        )

        self.layer_norm = nn.LayerNorm((config.hidden_size, ), epsilon=1e-12)
        self.dropout = nn.Dropout(1.0 - config.hidden_dropout_prob)
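A sketch of the matching forward pass, assuming the usual BERT-style embedding sum; the argument names and the use of plain + operators are my own simplification, not taken from the source:

    def construct(self, input_ids, token_type_ids, position_ids):
        # Assumed forward pass: sum the three embeddings, then normalize and drop out.
        embeddings = self.token_embeddings(input_ids)
        embeddings = embeddings + self.position_embeddings(position_ids)
        embeddings = embeddings + self.token_type_embeddings(token_type_ids)
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings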
Example #12
    def __init__(self, config):
        super(PANGUALPHA_ModelPipeline, self).__init__()
        self.pangu_alpha_embedding = PANGUALPHA_EmbeddingPipeLine(config).set_comm_fusion(1)
        self.pangu_alpha_embedding.stage = 0
        self.pangu_alpha_mask = PANGUALPHA_Mask(config)
        self.blocks = nn.CellList()
        dropout_recompute = False
        self.top_query_embedding = nn.Embedding(config.seq_length, config.embedding_size,
                                                embedding_table=TruncatedNormal(0.02))
        self.top_query_embedding.gather.shard(((1, 1), (config.dp,)))
        self.top_query_embedding.expand.shard(((config.dp, 1),))
        for i in range(config.num_layers):
            if i == config.num_layers - 1:
                self.top_query_embedding.set_comm_fusion(2)
                self.top_query_embedding.stage = i * config.stage_num // config.num_layers
                per_block = QueryLayer(config).set_comm_fusion(2)
            else:
                per_block = Block(config, i + 1).set_comm_fusion(2)
            per_block.stage = i * config.stage_num // config.num_layers
            per_block.recompute()
            self.blocks.append(per_block)
            if not dropout_recompute:
                per_block.attention.dropout.dropout_gen_mask.recompute(False).add_prim_attr("_side_effect", True)
                per_block.attention.prob_dropout.dropout_gen_mask.recompute(False).add_prim_attr("_side_effect", True)
                per_block.output.dropout.dropout_gen_mask.recompute(False).add_prim_attr("_side_effect", True)

        if config.self_layernorm:
            self.layernorm = LayerNorm((config.embedding_size,), config.dp).to_float(mstype.float32)
        else:
            self.layernorm = nn.LayerNorm(
                (config.embedding_size,)).to_float(mstype.float32)
            self.layernorm.layer_norm.shard(((config.dp, 1, 1), (1,), (1,)))
        self.layernorm.set_comm_fusion(2)
        #self.layernorm.set_comm_fusion(3)
        self.layernorm.stage = config.stage_num - 1
        self.use_past = config.use_past
        self.past = tuple([None] * config.num_layers)
        self.dtype = config.compute_dtype
        self.num_layers = config.num_layers
Example #13
 def __init__(self, config, is_training, num_tokens, dropout_prob=0.0, use_one_hot_embeddings=False):
     super(BertPoetryModel, self).__init__()
     self.bert = BertModel(config, is_training, use_one_hot_embeddings)
     self.num_tokens = num_tokens
     idx = np.arange(config.seq_length)
     mask = idx[None, :] <= idx[:, None]
     self.mask = Tensor([mask], mstype.float32)
     self.MLM_Dense = nn.Dense(config.hidden_size, config.hidden_size,\
                             has_bias=True, weight_init=TruncatedNormal(0.02),\
                             activation='gelu').to_float(mstype.float16)
     self.layer_norm = nn.LayerNorm((config.hidden_size,))
     self.matmul = ops.MatMul(transpose_b=True)
     self.biasadd = Parameter(initializer('zero', self.num_tokens), name='MLM_output_biasadd')
     self.softmax = ops.Softmax(axis=-1)
     self.seq_length = config.seq_length
     self.hidden_size = config.hidden_size
     self.cast = ops.Cast()
     self.reshape = ops.Reshape()
     self.batch_matmul = ops.BatchMatMul()
     ones = np.ones(shape=(config.batch_size, config.seq_length, config.seq_length))
     self.lower_triangle_mask = Tensor(np.tril(ones), dtype=mstype.float32)
     self.multiply = ops.Mul()
Example #14
 def __init__(self,
              use_relative_positions,
              embedding_size,
              embedding_shape,
              use_token_type=False,
              token_type_vocab_size=16,
              use_one_hot_embeddings=False,
              initializer_range=0.02,
              max_position_embeddings=512,
              dropout_prob=0.1):
     super(EmbeddingPostprocessor, self).__init__()
     self.use_token_type = use_token_type
     self.token_type_vocab_size = token_type_vocab_size
     self.use_one_hot_embeddings = use_one_hot_embeddings
     self.max_position_embeddings = max_position_embeddings
     self.token_type_embedding = nn.Embedding(
         vocab_size=token_type_vocab_size,
         embedding_size=embedding_size,
         use_one_hot=use_one_hot_embeddings)
     self.shape_flat = (-1, )
     self.one_hot = P.OneHot()
     self.on_value = Tensor(1.0, mstype.float32)
     self.off_value = Tensor(0.1, mstype.float32)
     self.array_mul = P.MatMul()
     self.reshape = P.Reshape()
     self.shape = tuple(embedding_shape)
     self.dropout = nn.Dropout(1 - dropout_prob)
     self.gather = P.Gather()
     self.use_relative_positions = use_relative_positions
     self.slice = P.StridedSlice()
     _, seq, _ = self.shape
     self.full_position_embedding = nn.Embedding(
         vocab_size=max_position_embeddings,
         embedding_size=embedding_size,
         use_one_hot=False)
     self.layernorm = nn.LayerNorm((embedding_size, ))
     self.position_ids = Tensor(
         np.arange(seq).reshape(-1, seq).astype(np.int32))
     self.add = P.Add()
Example #15
    def __init__(self,
                 embedding_size,
                 embedding_shape,
                 use_relative_positions=False,
                 use_token_type=False,
                 token_type_vocab_size=16,
                 use_one_hot_embeddings=False,
                 initializer_range=0.02,
                 max_position_embeddings=512,
                 dropout_prob=0.1):
        super(EmbeddingPostprocessor, self).__init__()
        self.use_token_type = use_token_type
        self.token_type_vocab_size = token_type_vocab_size
        self.use_one_hot_embeddings = use_one_hot_embeddings
        self.max_position_embeddings = max_position_embeddings
        self.embedding_table = Parameter(initializer
                                         (TruncatedNormal(initializer_range),
                                          [token_type_vocab_size,
                                           embedding_size]),
                                         name='embedding_table')

        self.shape_flat = (-1,)
        self.one_hot = P.OneHot()
        self.on_value = Tensor(1.0, mstype.float32)
        self.off_value = Tensor(0.1, mstype.float32)
        self.array_mul = P.MatMul()
        self.reshape = P.Reshape()
        self.shape = tuple(embedding_shape)
        self.layernorm = nn.LayerNorm((embedding_size,))
        self.dropout = nn.Dropout(1 - dropout_prob)
        self.gather = P.GatherV2()
        self.use_relative_positions = use_relative_positions
        self.slice = P.StridedSlice()
        self.full_position_embeddings = Parameter(initializer
                                                  (TruncatedNormal(initializer_range),
                                                   [max_position_embeddings,
                                                    embedding_size]),
                                                  name='full_position_embeddings')
Example #16
 def __init__(self,
              in_channels,
              out_channels,
              initializer_range=0.02,
              dropout_prob=0.1,
              compute_type=mstype.float32,
              enable_fused_layernorm=False):
     super(BertOutput, self).__init__()
     # Note: damping, loss_scale, frequency and batch_size below are not parameters of this
     # __init__; they are presumably module-level values in the original THOR-based script.
     self.dense = Dense_Thor(in_channels=in_channels,
                             out_channels=out_channels,
                             weight_init=TruncatedNormal(initializer_range),
                             has_bias=True,
                             bias_init='zeros',
                             damping=damping,
                             loss_scale=loss_scale,
                             frequency=frequency,
                             activation=None,
                             batch_size=batch_size).to_float(compute_type)
     self.dropout = nn.Dropout(1 - dropout_prob)
     self.dropout_prob = dropout_prob
     self.add = P.TensorAdd()
     self.layernorm = nn.LayerNorm((out_channels, )).to_float(compute_type)
     self.cast = P.Cast()
Example #17
 def __init__(self, in_channels=None):
     super(LayerNorm, self).__init__()
     self.layer_norm = nn.LayerNorm((in_channels, ))
     self.cast = P.Cast()
     self.get_dtype = P.DType()
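For reference, a self-contained usage sketch (assuming only that MindSpore and NumPy are installed) showing how the (in_channels, ) tuple passed to nn.LayerNorm maps onto the trailing axis of the input:

import numpy as np
import mindspore.nn as nn
import mindspore.common.dtype as mstype
from mindspore import Tensor

layer_norm = nn.LayerNorm((16,))                  # normalize over the last axis
x = Tensor(np.ones((2, 8, 16)), mstype.float32)   # (batch, seq_length, hidden)
y = layer_norm(x)                                 # output keeps the input shape (2, 8, 16)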
Example #18
def test_check_layer_norm_1():
    x = Tensor(np.ones([20, 5, 10, 10]), mstype.float32)
    shape1 = x.shape[1:]
    m = nn.LayerNorm(shape1, -1, 1)
    with pytest.raises(NotImplementedError):
        m(x)
Example #19
def test_check_layer_norm_3():
    x = Tensor(np.ones([20, 5, 10, 10]), mstype.float32)
    shape1 = (10, 10)
    m = nn.LayerNorm(shape1, begin_params_axis=2)
    with pytest.raises(NotImplementedError):
        m(x)
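The two tests above only assert that an error is raised for inconsistent axis settings. For contrast, here is a hedged, self-contained example of my own (not from the test suite) where normalized_shape matches x.shape[begin_params_axis:] and the call succeeds:

import numpy as np
import mindspore.nn as nn
import mindspore.common.dtype as mstype
from mindspore import Tensor

def check_layer_norm_valid_axes():
    x = Tensor(np.ones([20, 5, 10, 10]), mstype.float32)
    # normalized_shape equals x.shape[1:], so begin_norm_axis=1 and begin_params_axis=1 are consistent
    m = nn.LayerNorm((5, 10, 10), begin_norm_axis=1, begin_params_axis=1)
    out = m(x)
    assert out.shape == (20, 5, 10, 10)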
Example #20
 def __init__(self,
              hidden_size: int,
              hidden_dropout_prob: int = 0.1) -> None:
     super().__init__()
     self.layer_norm = nn.LayerNorm((hidden_size, ), epsilon=1e-12)
     self.dropout = nn.Dropout(1.0 - hidden_dropout_prob)
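The corresponding construct is not included; below is a short sketch of what the two attributes suggest (normalize, then drop out), stated as an assumption:

 def construct(self, hidden_states):
     # Assumed forward pass: layer-normalize the hidden states, then apply dropout.
     hidden_states = self.layer_norm(hidden_states)
     hidden_states = self.dropout(hidden_states)
     return hidden_states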
Example #21
    def __init__(self, config):
        super(PANGUALPHA_Model, self).__init__()
        self.get_attention_mask = AttentionMask(config)
        self.word_embedding = EmbeddingLookup(config).set_comm_fusion(1)
        self.eod_reset = config.eod_reset
        if config.load_ckpt_path:
            # Loading the embedding table from the ckpt path:
            embedding_path = os.path.join(config.load_ckpt_path, 'position_embedding.npy')
            if os.path.exists(embedding_path):
                p_table = np.load(embedding_path)
                position_table_param = Tensor(p_table, mstype.float32)
            else:
                raise ValueError(f"{embedding_path} file not exits, please check whether position_embedding file exit.")
        else:
            position_table_param = TruncatedNormal(0.02)
            
        self.position_embedding = nn.Embedding(
            config.seq_length,
            config.embedding_size,
            embedding_table=position_table_param).set_comm_fusion(1)
        self.word_embedding.embedding_table.parallel_optimizer = False
        self.position_embedding.embedding_table.parallel_optimizer = False
        self.position_embedding.gather.shard(((1, 1), (config.dp,)))
        self.position_embedding.expand.shard(((config.dp, 1),))
        self.blocks = nn.CellList()
        fusion_group_num = 4
        fusion_group_size = config.num_layers // fusion_group_num
        fusion_group_size = max(fusion_group_size, 1)

        num_layers = config.num_layers - 1
        self.num_layers = num_layers

        for i in range(num_layers):
            per_block = Block(config, i + 1).set_comm_fusion(int(i / fusion_group_size) + 2)
            per_block.recompute()
            per_block.attention.dropout.dropout_gen_mask.recompute(False)
            per_block.attention.prob_dropout.dropout_gen_mask.recompute(False)
            per_block.output.dropout.dropout_gen_mask.recompute(False)
            per_block.attention.dropout.dropout_gen_mask.add_prim_attr("_side_effect", True)
            per_block.attention.prob_dropout.dropout_gen_mask.add_prim_attr("_side_effect", True)
            per_block.output.dropout.dropout_gen_mask.add_prim_attr("_side_effect", True)
            self.blocks.append(per_block)

        if config.self_layernorm:
            self.layernorm = LayerNorm((config.embedding_size,), config.dp).to_float(
                mstype.float32).set_comm_fusion(
                int((num_layers - 1) / fusion_group_size) + 2)
        else:
            self.layernorm = nn.LayerNorm((config.embedding_size,)).to_float(
                mstype.float32).set_comm_fusion(
                int((num_layers - 1) / fusion_group_size) + 2)
            self.layernorm.layer_norm.shard(((config.dp, 1, 1), (1,), (1,)))
        self.layernorm.gamma.parallel_optimizer = False
        self.layernorm.beta.parallel_optimizer = False
        self.use_past = config.use_past
        self.past = tuple([None] * config.num_layers)
        self.add = P.TensorAdd().shard(((config.dp, 1, 1), (config.dp, 1, 1)))
        self.expand_dims = P.ExpandDims().shard(((config.dp, 1, 1),))
        self.dtype = config.compute_dtype
        self.dropout = nn.Dropout(1 - config.dropout_rate)
        self.dropout.dropout_gen_mask.shard(((config.dp, 1, 1),))
        self.dropout.dropout_do_mask.shard(((config.dp, 1, 1),))

        if config.load_ckpt_path:
            # Loading the embedding table from the ckpt path:
            embedding_path = os.path.join(config.load_ckpt_path, 'top_query_embedding.npy')
            if os.path.exists(embedding_path):
                top_query_table = np.load(embedding_path)
                top_query_table_param = Tensor(top_query_table, mstype.float32)
            else:
                raise ValueError(f"{embedding_path} file not exits, please check whether top_query_embedding file exist.")
        else:
            top_query_table_param = TruncatedNormal(0.02)
            
        self.top_query_embedding = nn.Embedding(config.seq_length, config.embedding_size, \
                                                embedding_table=top_query_table_param).set_comm_fusion(
            int((config.num_layers - 1) / fusion_group_num) + 2)
        self.top_query_embedding.embedding_table.parallel_optimizer = False
        self.top_query_embedding.gather.shard(((1, 1), (config.dp,)))
        self.top_query_embedding.expand.shard(((config.dp, 1),))
        self.top_query_layer = QueryLayer(config)

        self.top_query_layer.recompute()

        self.top_query_layer.output.dropout.dropout_gen_mask.recompute(False)
        self.top_query_layer.attention.dropout.dropout_gen_mask.recompute(False)
        self.top_query_layer.attention.prob_dropout.dropout_gen_mask.recompute(False)

        self.top_query_layer.output.dropout.dropout_gen_mask.add_prim_attr("_side_effect", True)
        self.top_query_layer.attention.dropout.dropout_gen_mask.add_prim_attr("_side_effect", True)
        self.top_query_layer.attention.prob_dropout.dropout_gen_mask.add_prim_attr("_side_effect", True)

        self.top_query_layer.set_comm_fusion(int((config.num_layers - 1) / fusion_group_num) + 2)