def __init__(self,
             length,
             depth,
             max_relative_position,
             initializer_range,
             use_one_hot_embeddings=False):
    """Create the relative-position embedding table and the operators used with it.

    Args:
        length: Forwarded to ``RelaPosMatrixGenerator`` as ``length``.
        depth: Size of the second dimension of the embedding table.
        max_relative_position: Clipping distance forwarded to
            ``RelaPosMatrixGenerator``; the table gets
            ``2 * max_relative_position + 1`` rows.
        initializer_range: Argument handed to ``TruncatedNormal`` when the
            embedding table is initialised.
        use_one_hot_embeddings: Flag stored on the instance unchanged.
    """
    super(RelaPosEmbeddingsGenerator, self).__init__()

    vocab_size = 2 * max_relative_position + 1
    self.depth = depth
    self.vocab_size = vocab_size
    self.use_one_hot_embeddings = use_one_hot_embeddings

    # Trainable lookup table of shape [vocab_size, depth].
    table_init = initializer(TruncatedNormal(initializer_range), [vocab_size, depth])
    self.embeddings_table = Parameter(table_init, name='embeddings_for_position')

    self.relative_positions_matrix = RelaPosMatrixGenerator(
        length=length,
        max_relative_position=max_relative_position,
    )

    # Stateless operators and the constants fed to the one-hot op.
    self.reshape = ops.Reshape()
    self.one_hot = ops.OneHot()
    self.on_value = Tensor(1.0, mstype.float32)
    self.off_value = Tensor(0.0, mstype.float32)
    self.shape = ops.Shape()
    self.gather = ops.GatherV2()  # the original author labels this "index_select"
    self.matmul = ops.BatchMatMul()
def construct(self, s_t_hat, encoder_outputs, encoder_feature, enc_padding_mask, coverage):
    """Compute the attention context vector and distribution for one decoder step.

    Shape comments follow the original author's annotations, which treat
    ``n`` (last dimension of ``encoder_outputs``) as ``2 * hidden_dim``.

    Args:
        s_t_hat: Decoder state, projected by ``self.decode_proj``.
        encoder_outputs: Rank-3 tensor unpacked as ``(b, t_k, n)``.
        encoder_feature: Encoder features added to the projected decoder
            state. It is flattened to ``(-1, n)`` here, so both a
            ``(b, t_k, n)`` and a ``(b * t_k, n)`` layout are accepted.
        enc_padding_mask: Mask multiplied into the ``(B, t_k)`` softmax output.
        coverage: Coverage tensor; only read and updated when
            ``self.is_coverage`` is true.

    Returns:
        Tuple ``(c_t, attn_dist, coverage)``: the context vector reshaped to
        ``(-1, 2 * hidden_dim)``, the re-normalised attention distribution
        reshaped to ``(-1, t_k)``, and the coverage tensor (with ``attn_dist``
        added when coverage is enabled, otherwise returned as passed in).
    """
    b, t_k, n = encoder_outputs.shape

    dec_fea = self.decode_proj(s_t_hat)  # (B, 2 * hidden_dim)
    dec_fea_expand = P.ExpandDims()(dec_fea, 1)  # (B, 1, 2 * hidden_dim)
    # BroadcastTo receives the target shape in its constructor, not at call time.
    dec_fea_expand = P.BroadcastTo((b, t_k, n))(dec_fea_expand)  # (B, t_k, n)

    # Flatten both operands so the sum is (B * t_k, n); this matches the 2-D
    # coverage feature added below and the shape annotations on e / scores.
    # NOTE(review): assumes the last dimension of encoder_feature is n - confirm.
    att_features = encoder_feature.view(-1, n) + dec_fea_expand.view(-1, n)

    if self.is_coverage:
        coverage_input = coverage.view(-1, 1)  # (B * t_k, 1)
        coverage_feature = self.W_c(coverage_input)  # (B * t_k, 2 * hidden_dim)
        att_features = att_features + coverage_feature

    e = P.Tanh()(att_features)  # (B * t_k, 2 * hidden_dim)
    scores = self.v(e)  # (B * t_k, 1)
    scores = scores.view(-1, t_k)  # (B, t_k)

    # Mask the softmax output, then re-normalise over the t_k axis.
    attn_dist_ = P.Softmax(1)(scores) * enc_padding_mask  # (B, t_k)
    normalization_factor = P.ReduceSum(True)(attn_dist_, 1)
    attn_dist = attn_dist_ / normalization_factor

    attn_dist = P.ExpandDims()(attn_dist, 1)  # (B, 1, t_k)
    # Instantiate the primitive first, then apply it to the tensors; passing
    # the tensors to the constructor would bind them to transpose_a/transpose_b.
    c_t = P.BatchMatMul()(attn_dist, encoder_outputs)  # (B, 1, n)
    c_t = c_t.view(-1, self.hidden_dim * 2)  # (B, 2 * hidden_dim)

    attn_dist = attn_dist.view(-1, t_k)

    if self.is_coverage:
        coverage = coverage.view(-1, t_k)
        coverage = coverage + attn_dist

    return c_t, attn_dist, coverage
def __init__(self, config):
    """Prepare the constants and operators used to build an attention mask.

    Args:
        config: Object providing ``input_mask_from_dataset``, ``batch_size``
            and ``seq_length`` attributes.
    """
    super(CreateAttentionMaskFromInputMask, self).__init__()

    self.input_mask_from_dataset = config.input_mask_from_dataset
    batch_size = config.batch_size
    seq_length = config.seq_length

    # When the mask is not supplied by the dataset, fall back to an all-ones
    # int32 mask of shape [batch_size, seq_length].
    self.input_mask = None
    if not self.input_mask_from_dataset:
        default_mask = initializer("ones", [batch_size, seq_length], mstype.int32)
        self.input_mask = default_mask.to_tensor()

    self.cast = ops.Cast()
    self.reshape = ops.Reshape()
    self.shape = (batch_size, 1, seq_length)

    # All-ones float32 tensor of shape [batch_size, seq_length, 1].
    ones_column = initializer("ones", [batch_size, seq_length, 1], mstype.float32)
    self.broadcast_ones = ones_column.to_tensor()
    self.batch_matmul = ops.BatchMatMul()
def __init__(self,
             config,
             is_training,
             num_tokens,
             dropout_prob=0.0,
             use_one_hot_embeddings=False):
    """Build the BERT backbone plus the output head, masks and operators.

    Args:
        config: Object providing ``seq_length``, ``hidden_size`` and
            ``batch_size``; also forwarded to ``BertModel``.
        is_training: Forwarded to ``BertModel``.
        num_tokens: Stored on the instance and used as the shape of the
            output bias parameter.
        dropout_prob: Accepted for interface compatibility; not used in
            this constructor.
        use_one_hot_embeddings: Forwarded to ``BertModel``.
    """
    super(BertPoetryModel, self).__init__()

    seq_length = config.seq_length
    hidden_size = config.hidden_size

    self.bert = BertModel(config, is_training, use_one_hot_embeddings)
    self.num_tokens = num_tokens

    # Boolean lower-triangular (column <= row) matrix, wrapped in a list so
    # the resulting float32 tensor has a leading dimension of 1.
    positions = np.arange(seq_length)
    mask = positions[None, :] <= positions[:, None]
    self.mask = Tensor([mask], mstype.float32)

    # hidden_size -> hidden_size projection with GELU, cast to float16.
    dense = nn.Dense(
        hidden_size,
        hidden_size,
        has_bias=True,
        weight_init=TruncatedNormal(0.02),
        activation='gelu',
    )
    self.MLM_Dense = dense.to_float(mstype.float16)
    self.layer_norm = nn.LayerNorm((hidden_size,))
    self.matmul = ops.MatMul(transpose_b=True)
    self.biasadd = Parameter(initializer('zero', num_tokens), name='MLM_output_biasadd')
    self.softmax = ops.Softmax(axis=-1)

    self.seq_length = seq_length
    self.hidden_size = hidden_size

    self.cast = ops.Cast()
    self.reshape = ops.Reshape()
    self.batch_matmul = ops.BatchMatMul()

    # Per-batch lower-triangular mask of ones, shape
    # (batch_size, seq_length, seq_length).
    ones = np.ones((config.batch_size, seq_length, seq_length))
    self.lower_triangle_mask = Tensor(np.tril(ones), dtype=mstype.float32)
    self.multiply = ops.Mul()
def __init__(self,
             length,
             depth,
             max_relative_position,
             initializer_range,
             use_one_hot_embeddings=False):
    """Create the relative-position embedding table and its helper operators.

    Args:
        length: Forwarded to ``RelaPosMatrixGenerator`` as ``length``.
        depth: Size of the second dimension of the embedding table.
        max_relative_position: Forwarded to ``RelaPosMatrixGenerator``; the
            table gets ``2 * max_relative_position + 1`` rows.
        initializer_range: Argument handed to ``TruncatedNormal`` when the
            embedding table is initialised.
        use_one_hot_embeddings: Flag stored on the instance unchanged.
    """
    super(RelaPosEmbeddingsGenerator, self).__init__()

    vocab_size = 2 * max_relative_position + 1
    self.depth = depth
    self.vocab_size = vocab_size
    self.use_one_hot_embeddings = use_one_hot_embeddings

    # Trainable lookup table of shape [vocab_size, depth].
    table_init = initializer(TruncatedNormal(initializer_range), [vocab_size, depth])
    self.embeddings_table = Parameter(table_init)

    self.relative_positions_matrix = RelaPosMatrixGenerator(
        length=length,
        max_relative_position=max_relative_position,
    )

    self.reshape = P.Reshape()
    self.one_hot = nn.OneHot(depth=vocab_size)
    self.shape = P.Shape()
    self.gather = P.Gather()  # the original author labels this "index_select"
    self.matmul = P.BatchMatMul()
def __init__(self,
             batch_size,
             from_tensor_width,
             to_tensor_width,
             from_seq_length,
             to_seq_length,
             num_attention_heads=1,
             size_per_head=512,
             query_act=None,
             key_act=None,
             value_act=None,
             has_attention_mask=False,
             attention_probs_dropout_prob=0.0,
             use_one_hot_embeddings=False,
             initializer_range=0.02,
             do_return_2d_tensor=False,
             use_relative_positions=False,
             compute_type=mstype.float32):
    """Set up projections, shape constants and operators for multi-head attention.

    Args:
        batch_size: Batch size baked into the stored reshape targets.
        from_tensor_width: Input width of the query projection.
        to_tensor_width: Input width of the key and value projections.
        from_seq_length: Sequence length used in the "from" shapes.
        to_seq_length: Sequence length used in the "to" shapes and passed to
            the relative-position generator.
        num_attention_heads: Number of heads.
        size_per_head: Width of each head.
        query_act: Activation passed to the query ``nn.Dense``.
        key_act: Activation passed to the key ``nn.Dense``.
        value_act: Activation passed to the value ``nn.Dense``.
        has_attention_mask: When true, the mask-related operators are created.
        attention_probs_dropout_prob: ``nn.Dropout`` is constructed with
            ``1 - attention_probs_dropout_prob``.
        use_one_hot_embeddings: Forwarded to ``RelaPosEmbeddingsGenerator``.
        initializer_range: Argument of the shared ``TruncatedNormal``
            initialiser; also forwarded to ``RelaPosEmbeddingsGenerator``.
        do_return_2d_tensor: Selects the 2-D or 3-D ``shape_return`` tuple.
        use_relative_positions: When true, a ``RelaPosEmbeddingsGenerator``
            is created.
        compute_type: Dtype for the dense layers, ``scores_mul`` and the
            saturating cast.
    """
    super(BertAttention, self).__init__()

    units = num_attention_heads * size_per_head

    self.batch_size = batch_size
    self.from_seq_length = from_seq_length
    self.to_seq_length = to_seq_length
    self.num_attention_heads = num_attention_heads
    self.size_per_head = size_per_head
    self.has_attention_mask = has_attention_mask
    self.use_relative_positions = use_relative_positions

    # 1 / sqrt(size_per_head), stored as a one-element tensor.
    scale = 1.0 / math.sqrt(float(size_per_head))
    self.scores_mul = Tensor([scale], dtype=compute_type)

    self.reshape = ops.Reshape()
    self.shape_from_2d = (-1, from_tensor_width)
    self.shape_to_2d = (-1, to_tensor_width)

    # One initialiser object is shared by the three projection layers.
    weight = TruncatedNormal(initializer_range)

    def _projection(in_width, activation):
        """Build one dense projection to ``units`` outputs in ``compute_type``."""
        layer = nn.Dense(in_width, units, activation=activation, weight_init=weight)
        return layer.to_float(compute_type)

    self.query_layer = _projection(from_tensor_width, query_act)
    self.key_layer = _projection(to_tensor_width, key_act)
    self.value_layer = _projection(to_tensor_width, value_act)

    self.shape_from = (batch_size, from_seq_length, num_attention_heads, size_per_head)
    self.shape_to = (batch_size, to_seq_length, num_attention_heads, size_per_head)

    self.matmul_trans_b = ops.BatchMatMul(transpose_b=True)
    self.multiply = ops.Mul()
    self.transpose = ops.Transpose()
    self.trans_shape = (0, 2, 1, 3)
    self.trans_shape_relative = (2, 0, 1, 3)
    self.trans_shape_position = (1, 2, 0, 3)
    # Held in float32 regardless of compute_type.
    self.multiply_data = Tensor([-10000.0,], dtype=mstype.float32)
    self.batch_num = batch_size * num_attention_heads
    self.matmul = ops.BatchMatMul()

    self.softmax = nn.Softmax()
    self.dropout = nn.Dropout(1 - attention_probs_dropout_prob)

    # Operators that are only needed when an attention mask is applied.
    if self.has_attention_mask:
        self.expand_dims = ops.ExpandDims()
        self.sub = ops.Sub()
        self.add = ops.TensorAdd()
        self.cast = ops.Cast()
        self.get_dtype = ops.DType()

    # Target shape for the output: flattened 2-D or per-batch 3-D.
    self.shape_return = (
        (batch_size * from_seq_length, units)
        if do_return_2d_tensor
        else (batch_size, from_seq_length, units)
    )

    self.cast_compute_type = SaturateCast(dst_type=compute_type)

    if self.use_relative_positions:
        self._generate_relative_positions_embeddings = RelaPosEmbeddingsGenerator(
            length=to_seq_length,
            depth=size_per_head,
            max_relative_position=16,
            initializer_range=initializer_range,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
def __init__(self):
    """Initialise the cell and create its single batched-matmul operator."""
    super(ClickPredictor, self).__init__()
    batched_matmul = ops.BatchMatMul()
    self.matmul = batched_matmul