def construct(self, input_ids, input_mask, token_type_id):
    sequence_output, _, _ = self.bert(input_ids, token_type_id, input_mask)
    batch_size, seq_length, hidden_size = P.Shape()(sequence_output)
    sequence = P.Reshape()(sequence_output, (-1, hidden_size))
    logits = self.dense_1(sequence)
    logits = P.Cast()(logits, self.dtype)
    logits = P.Reshape()(logits, (batch_size, seq_length, self.num_labels))
    logits = self.log_softmax(logits)
    return logits
def __init__(self, config):
    super(CreateAttentionMaskFromInputMask, self).__init__()
    self.input_mask = None
    self.cast = P.Cast()
    self.reshape = P.Reshape()
    self.shape = (-1, 1, config.seq_length)
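# A minimal construct sketch for CreateAttentionMaskFromInputMask, assuming the layer
# only needs to broadcast the 2-D input mask into the attention-mask shape consumed by
# BertAttention; all attribute names come from the __init__ above.
def construct(self, input_mask):
    # Reshape (batch, seq_length) -> (batch, 1, seq_length) and cast to float32 so the
    # mask can later be combined with the attention scores.
    attention_mask = self.cast(self.reshape(input_mask, self.shape), ts.float32)
    return attention_mask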
def __init__(self, is_training=True):
    super(CrossEntropyCalculation, self).__init__()
    self.onehot = P.OneHot()
    self.on_value = Tensor(1.0, ts.float32)
    self.off_value = Tensor(0.0, ts.float32)
    self.reduce_sum = P.ReduceSum()
    self.reduce_mean = P.ReduceMean()
    self.reshape = P.Reshape()
    self.last_idx = (-1,)
    self.neg = P.Neg()
    self.cast = P.Cast()
    self.is_training = is_training
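# A possible construct for CrossEntropyCalculation, sketched from the operators
# initialized above: one-hot encode the labels, take the negative log-likelihood of the
# target class, and average over the batch. It assumes the incoming logits are already
# log-probabilities (e.g. the LogSoftmax output of the NER head above).
def construct(self, logits, label_ids, num_labels):
    if self.is_training:
        label_ids = self.reshape(label_ids, self.last_idx)
        one_hot_labels = self.onehot(label_ids, num_labels, self.on_value, self.off_value)
        # NLL of the target class per example, then mean over the batch.
        per_example_loss = self.neg(self.reduce_sum(one_hot_labels * logits, self.last_idx))
        loss = self.reduce_mean(per_example_loss, self.last_idx)
        return self.cast(loss, ts.float32)
    # At evaluation time the logits are passed through unchanged.
    return logits * 1.0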
def __init__(self, length, max_relative_position):
    super(RelaPosMatrixGenerator, self).__init__()
    self._length = length
    self._max_relative_position = max_relative_position
    self._min_relative_position = -max_relative_position
    self.range_length = -length + 1
    self.tile = P.Tile()
    self.range_mat = P.Reshape()
    self.sub = P.Sub()
    self.expanddims = P.ExpandDims()
    self.cast = P.Cast()
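# A plain-Python reference for the matrix this generator produces (a hypothetical helper,
# shown only to document the math; the layer itself builds the same matrix with the
# Tile/Reshape/Sub ops above): entry (i, j) is the distance j - i clipped to
# [-max_relative_position, max_relative_position], then shifted to be non-negative so it
# can index an embedding table of size 2 * max_relative_position + 1.
def _relative_position_matrix(length, max_relative_position):
    mat = []
    for i in range(length):
        row = []
        for j in range(length):
            dist = max(-max_relative_position, min(j - i, max_relative_position))
            row.append(dist + max_relative_position)
        mat.append(row)
    return mat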
def __init__(self, config, is_training, num_labels=11, use_crf=False,
             dropout_prob=0.0, use_one_hot_embeddings=False):
    super(BertNERModel, self).__init__()
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0
    self.bert = Bert(config, is_training, use_one_hot_embeddings)
    self.cast = P.Cast()
    self.weight_init = TruncatedNormal(config.initializer_range)
    self.log_softmax = P.LogSoftmax(axis=-1)
    self.dtype = config.dtype
    self.num_labels = num_labels
    self.dense_1 = layers.Dense(config.hidden_size, self.num_labels,
                                weight_init=self.weight_init,
                                has_bias=True).to_float(config.compute_type)
    self.dropout = layers.Dropout(1 - dropout_prob)
    self.reshape = P.Reshape()
    self.shape = (-1, config.hidden_size)
    self.use_crf = use_crf
    self.origin_shape = (-1, config.seq_length, self.num_labels)
def __init__(self, params, learning_rate=1e-3, beta1=0.9, beta2=0.999,
             eps=1e-6, weight_decay=0.0):
    super(AdamWeightDecayForBert, self).__init__(learning_rate, params, weight_decay)
    _check_param_value(beta1, beta2, eps, self.cls_name)
    self.beta1 = ts.array([beta1], dtype=ts.float32)
    self.beta2 = ts.array([beta2], dtype=ts.float32)
    self.eps = ts.array([eps], dtype=ts.float32)
    self.moments1 = self.parameters.clone(prefix="adam_m", init='zeros')
    self.moments2 = self.parameters.clone(prefix="adam_v", init='zeros')
    self.hyper_map = P.HyperMap()
    self.op_select = P.Select()
    self.op_cast = P.Cast()
    self.op_reshape = P.Reshape()
    self.op_shape = P.Shape()
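# A plain-Python reference for the per-parameter AdamWeightDecay update this optimizer
# applies (a hypothetical helper, included only to document the math; the real update is
# dispatched through hyper_map over fused device ops and may use overflow-aware Select).
def _adam_weight_decay_step(param, grad, m, v, lr, beta1, beta2, eps, weight_decay):
    # Exponential moving averages of the gradient and its square.
    m_new = beta1 * m + (1.0 - beta1) * grad
    v_new = beta2 * v + (1.0 - beta2) * grad * grad
    # Decoupled weight decay is added to the Adam direction before the lr scaling.
    update = m_new / (v_new ** 0.5 + eps) + weight_decay * param
    return param - lr * update, m_new, v_new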
def __init__(self, vocab_size, embedding_size, embedding_shape,
             use_one_hot_embeddings=False, initializer_range=0.02):
    super(EmbeddingLookup, self).__init__()
    self.vocab_size = vocab_size
    self.use_one_hot_embeddings = use_one_hot_embeddings
    self.embedding_table = Parameter(initializer(TruncatedNormal(initializer_range),
                                                 [vocab_size, embedding_size]))
    self.expand = P.ExpandDims()
    self.shape_flat = (-1,)
    self.gather = P.Gather()
    self.one_hot = P.OneHot()
    self.on_value = Tensor(1.0, ts.float32)
    self.off_value = Tensor(0.0, ts.float32)
    self.array_mul = P.MatMul()
    self.reshape = P.Reshape()
    self.shape = tuple(embedding_shape)
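# A construct sketch for EmbeddingLookup, assembled from the operators above: flatten the
# token ids, fetch one embedding row per id (either by one-hot matmul or by gather), and
# reshape to embedding_shape. Returning the table alongside the output is assumed here so
# the masked-LM head can reuse it for weight tying.
def construct(self, input_ids):
    extended_ids = self.expand(input_ids, -1)
    flat_ids = self.reshape(extended_ids, self.shape_flat)
    if self.use_one_hot_embeddings:
        # One-hot ids times the table is equivalent to a gather, but stays dense.
        one_hot_ids = self.one_hot(flat_ids, self.vocab_size, self.on_value, self.off_value)
        output_for_reshape = self.array_mul(one_hot_ids, self.embedding_table)
    else:
        output_for_reshape = self.gather(self.embedding_table, flat_ids, 0)
    output = self.reshape(output_for_reshape, self.shape)
    return output, self.embedding_table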
def __init__(self, seq_length, hidden_size, num_attention_heads=12,
             attention_probs_dropout_prob=0.1, use_one_hot_embeddings=False,
             initializer_range=0.02, hidden_dropout_prob=0.1,
             use_relative_positions=False, compute_type=ts.float32):
    super(BertSelfAttention, self).__init__()
    if hidden_size % num_attention_heads != 0:
        raise ValueError("The hidden size (%d) is not a multiple of the number "
                         "of attention heads (%d)" % (hidden_size, num_attention_heads))
    self.size_per_head = int(hidden_size / num_attention_heads)
    self.attention = BertAttention(
        from_tensor_width=hidden_size,
        to_tensor_width=hidden_size,
        from_seq_length=seq_length,
        to_seq_length=seq_length,
        num_attention_heads=num_attention_heads,
        size_per_head=self.size_per_head,
        attention_probs_dropout_prob=attention_probs_dropout_prob,
        use_one_hot_embeddings=use_one_hot_embeddings,
        initializer_range=initializer_range,
        use_relative_positions=use_relative_positions,
        has_attention_mask=True,
        do_return_2d_tensor=True,
        compute_type=compute_type)
    self.output = BertOutput(in_channels=hidden_size,
                             out_channels=hidden_size,
                             initializer_range=initializer_range,
                             dropout_prob=hidden_dropout_prob,
                             compute_type=compute_type)
    self.reshape = P.Reshape()
    self.shape = (-1, hidden_size)
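# A minimal construct sketch for BertSelfAttention, assuming self-attention followed by
# the BertOutput projection with its residual connection; only attributes defined in the
# __init__ above are used.
def construct(self, input_tensor, attention_mask):
    # Flatten to 2-D so the attention block can return a 2-D tensor.
    input_tensor = self.reshape(input_tensor, self.shape)
    attention_output = self.attention(input_tensor, input_tensor, attention_mask)
    # BertOutput adds dropout, the residual with input_tensor, and layer norm.
    output = self.output(attention_output, input_tensor)
    return output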
def __init__(self, length, depth, max_relative_position, initializer_range,
             use_one_hot_embeddings=False):
    super(RelaPosEmbeddingsGenerator, self).__init__()
    self.depth = depth
    self.vocab_size = max_relative_position * 2 + 1
    self.use_one_hot_embeddings = use_one_hot_embeddings
    self.embeddings_table = Parameter(initializer(TruncatedNormal(initializer_range),
                                                  [self.vocab_size, self.depth]))
    self.relative_positions_matrix = RelaPosMatrixGenerator(
        length=length, max_relative_position=max_relative_position)
    self.reshape = P.Reshape()
    self.one_hot = layers.OneHot(depth=self.vocab_size)
    self.shape = P.Shape()
    self.gather = P.Gather()  # index_select
    self.matmul = P.BatchMatMul()
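# A minimal construct sketch, assuming the default (non-one-hot) path: build the
# relative-position index matrix and gather the matching rows of the embedding table,
# which yields a (length, length, depth) tensor of relative-position embeddings. The
# one-hot branch implied by use_one_hot_embeddings is omitted here.
def construct(self):
    relative_positions_matrix_out = self.relative_positions_matrix()
    # Each index selects one row of embeddings_table along axis 0.
    embeddings = self.gather(self.embeddings_table, relative_positions_matrix_out, 0)
    return embeddings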
def __init__(self, embedding_size, embedding_shape, use_relative_positions=False,
             use_token_type=False, token_type_vocab_size=16, use_one_hot_embeddings=False,
             initializer_range=0.02, max_position_embeddings=512, dropout_prob=0.1):
    super(EmbeddingPostprocessor, self).__init__()
    self.use_token_type = use_token_type
    self.token_type_vocab_size = token_type_vocab_size
    self.use_one_hot_embeddings = use_one_hot_embeddings
    self.max_position_embeddings = max_position_embeddings
    self.embedding_table = Parameter(initializer(TruncatedNormal(initializer_range),
                                                 [token_type_vocab_size, embedding_size]),
                                     name='embedding_table')
    self.shape_flat = (-1,)
    self.one_hot = layers.OneHot()
    self.on_value = Tensor(1.0, ts.float32)
    self.off_value = Tensor(0.0, ts.float32)
    self.array_mul = P.MatMul()
    self.reshape = P.Reshape()
    self.shape = tuple(embedding_shape)
    self.layernorm = layers.LayerNorm((embedding_size,))
    self.dropout = layers.Dropout(1 - dropout_prob)
    self.gather = P.Gather()
    self.use_relative_positions = use_relative_positions
    self.slice = P.StridedSlice()
    self.full_position_embeddings = Parameter(initializer(TruncatedNormal(initializer_range),
                                                          [max_position_embeddings, embedding_size]),
                                              name='full_position_embeddings')
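# A simplified construct sketch, assuming embedding_shape is (batch_size, seq_length,
# embedding_size) and the default non-one-hot token-type path: add segment and absolute
# position embeddings to the word embeddings, then layer-normalize and apply dropout.
# The one-hot and relative-position variants are omitted here.
def construct(self, token_type_ids, word_embeddings):
    output = word_embeddings
    if self.use_token_type:
        # One segment embedding per token, broadcast-added to the word embeddings.
        token_type_embeddings = self.gather(self.embedding_table, token_type_ids, 0)
        output += token_type_embeddings
    if not self.use_relative_positions:
        # Slice the first seq_length rows of the learned position table and broadcast.
        seq_length = self.shape[1]
        position_embeddings = self.slice(self.full_position_embeddings,
                                         (0, 0),
                                         (seq_length, self.shape[2]),
                                         (1, 1))
        output += position_embeddings
    output = self.layernorm(output)
    return self.dropout(output)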
def __init__(self, hidden_size, seq_length, num_hidden_layers, num_attention_heads=12,
             intermediate_size=3072, attention_probs_dropout_prob=0.1,
             use_one_hot_embeddings=False, initializer_range=0.02,
             hidden_dropout_prob=0.1, use_relative_positions=False,
             hidden_act="gelu", compute_type=ts.float32, return_all_encoders=False):
    super(BertTransformer, self).__init__()
    self.return_all_encoders = return_all_encoders
    slayers = []
    for _ in range(num_hidden_layers):
        layer = BertEncoderLayer(hidden_size=hidden_size,
                                 seq_length=seq_length,
                                 num_attention_heads=num_attention_heads,
                                 intermediate_size=intermediate_size,
                                 attention_probs_dropout_prob=attention_probs_dropout_prob,
                                 use_one_hot_embeddings=use_one_hot_embeddings,
                                 initializer_range=initializer_range,
                                 hidden_dropout_prob=hidden_dropout_prob,
                                 use_relative_positions=use_relative_positions,
                                 hidden_act=hidden_act,
                                 compute_type=compute_type)
        slayers.append(layer)
    self.layers = layers.LayerList(slayers)
    self.reshape = P.Reshape()
    self.shape = (-1, hidden_size)
    self.out_shape = (-1, seq_length, hidden_size)
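# A construct sketch for BertTransformer, assuming each BertEncoderLayer consumes and
# produces a 2-D (batch*seq_length, hidden_size) tensor: run the stacked encoder layers
# and return either every layer's output or only the last one, reshaped back to 3-D.
def construct(self, input_tensor, attention_mask):
    prev_output = self.reshape(input_tensor, self.shape)
    all_encoder_layers = ()
    for layer_module in self.layers:
        layer_output = layer_module(prev_output, attention_mask)
        prev_output = layer_output
        if self.return_all_encoders:
            layer_output = self.reshape(layer_output, self.out_shape)
            all_encoder_layers = all_encoder_layers + (layer_output,)
    if not self.return_all_encoders:
        prev_output = self.reshape(prev_output, self.out_shape)
        all_encoder_layers = all_encoder_layers + (prev_output,)
    return all_encoder_layers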
def __init__(self, network, optimizer, scale_update_layer=None):
    super(BertFinetuneLayer, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.optimizer.global_step = Parameter(initializer(0., [1,]), name='global_step')
    self.grad = P.GradOperation(get_by_list=True, sens_param=True)
    self.allreduce = P.AllReduce()
    self.grad_reducer = None
    self.cast = P.Cast()
    self.gpu_target = False
    if context.get_context("device_target") == "GPU":
        self.gpu_target = True
        self.float_status = P.FloatStatus()
        self.addn = P.AddN()
        self.reshape = P.Reshape()
    else:
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_before_grad = P.NPUClearFloatStatus()
    self.reduce_sum = P.ReduceSum(keep_dims=False)
    self.depend_parameter_use = P.Depend()
    self.base = Tensor(1, ts.float32)
    self.less_equal = P.LessEqual()
    self.hyper_map = P.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_layer
    if scale_update_layer:
        self.loss_scale = Parameter(Tensor(scale_update_layer.get_loss_scale(),
                                           dtype=ts.float32),
                                    name="loss_scale")
def __init__(self, from_tensor_width, to_tensor_width, from_seq_length, to_seq_length,
             num_attention_heads=1, size_per_head=512, query_act=None, key_act=None,
             value_act=None, has_attention_mask=False, attention_probs_dropout_prob=0.0,
             use_one_hot_embeddings=False, initializer_range=0.02, do_return_2d_tensor=False,
             use_relative_positions=False, compute_type=ts.float32):
    super(BertAttention, self).__init__()
    self.from_seq_length = from_seq_length
    self.to_seq_length = to_seq_length
    self.num_attention_heads = num_attention_heads
    self.size_per_head = size_per_head
    self.has_attention_mask = has_attention_mask
    self.use_relative_positions = use_relative_positions

    self.scores_mul = 1.0 / math.sqrt(float(self.size_per_head))
    self.reshape = P.Reshape()
    self.shape_from_2d = (-1, from_tensor_width)
    self.shape_to_2d = (-1, to_tensor_width)
    weight = TruncatedNormal(initializer_range)
    units = num_attention_heads * size_per_head
    self.query_layer = layers.Dense(from_tensor_width, units, activation=query_act,
                                    weight_init=weight).to_float(compute_type)
    self.key_layer = layers.Dense(to_tensor_width, units, activation=key_act,
                                  weight_init=weight).to_float(compute_type)
    self.value_layer = layers.Dense(to_tensor_width, units, activation=value_act,
                                    weight_init=weight).to_float(compute_type)

    self.shape_from = (-1, from_seq_length, num_attention_heads, size_per_head)
    self.shape_to = (-1, to_seq_length, num_attention_heads, size_per_head)

    self.matmul_trans_b = P.BatchMatMul(transpose_b=True)
    self.multiply = P.Mul()
    self.transpose = P.Transpose()
    self.trans_shape = (0, 2, 1, 3)
    self.trans_shape_relative = (2, 0, 1, 3)
    self.trans_shape_position = (1, 2, 0, 3)
    self.multiply_data = -10000.0
    self.matmul = P.BatchMatMul()
    self.softmax = layers.Softmax()
    self.dropout = layers.Dropout(1 - attention_probs_dropout_prob)

    if self.has_attention_mask:
        self.expand_dims = P.ExpandDims()
        self.sub = P.Sub()
        self.add = P.Add()
        self.cast = P.Cast()
        self.get_dtype = P.DType()

    if do_return_2d_tensor:
        self.shape_return = (-1, num_attention_heads * size_per_head)
    else:
        self.shape_return = (-1, from_seq_length, num_attention_heads * size_per_head)

    self.cast_compute_type = SaturateCast(dst_type=compute_type)
    if self.use_relative_positions:
        self._generate_relative_positions_embeddings = \
            RelaPosEmbeddingsGenerator(length=to_seq_length,
                                       depth=size_per_head,
                                       max_relative_position=16,
                                       initializer_range=initializer_range,
                                       use_one_hot_embeddings=use_one_hot_embeddings)
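# A simplified sketch of the core attention path (the relative-position terms gated by
# use_relative_positions are omitted), showing how the operators above fit together.
# The Tensor(1.0, ts.float32) constant used for the mask inversion is an assumption of
# this sketch, not a claim about the full implementation.
def construct(self, from_tensor, to_tensor, attention_mask):
    # Project the flattened inputs into per-head query/key/value tensors of shape
    # (batch, heads, seq, size_per_head).
    from_tensor_2d = self.reshape(from_tensor, self.shape_from_2d)
    to_tensor_2d = self.reshape(to_tensor, self.shape_to_2d)
    query = self.transpose(self.reshape(self.query_layer(from_tensor_2d), self.shape_from),
                           self.trans_shape)
    key = self.transpose(self.reshape(self.key_layer(to_tensor_2d), self.shape_to),
                         self.trans_shape)
    value = self.transpose(self.reshape(self.value_layer(to_tensor_2d), self.shape_to),
                           self.trans_shape)
    # Scaled dot-product scores: (batch, heads, from_seq, to_seq).
    attention_scores = self.multiply(self.scores_mul, self.matmul_trans_b(query, key))
    if self.has_attention_mask:
        # Masked positions receive a -10000 bias so softmax drives their weight to ~0.
        attention_mask = self.expand_dims(attention_mask, 1)
        ones = self.cast(Tensor(1.0, ts.float32), self.get_dtype(attention_scores))
        inverted_mask = self.sub(ones, self.cast(attention_mask, self.get_dtype(attention_scores)))
        attention_scores = self.add(self.multiply(self.multiply_data, inverted_mask),
                                    attention_scores)
    attention_probs = self.dropout(self.softmax(attention_scores))
    # Weighted sum of values, reshaped to 2-D or 3-D depending on do_return_2d_tensor.
    context_layer = self.transpose(self.matmul(attention_probs, value), self.trans_shape)
    return self.reshape(context_layer, self.shape_return)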