def construct(self, input_ids, token_type_ids, input_mask):
    """Bidirectional Encoder Representations from Transformers."""
    # embedding
    embedding_tables = self.bert_embedding_lookup.embedding_table
    word_embeddings = self.bert_embedding_lookup(input_ids)
    embedding_output = self.bert_embedding_postprocessor(token_type_ids, word_embeddings)

    # attention mask [batch_size, seq_length, seq_length]
    attention_mask = self._create_attention_mask_from_input_mask(input_mask)

    # bert encoder
    encoder_output = self.bert_encoder(self.cast_compute_type(embedding_output), attention_mask)
    sequence_output = self.cast(encoder_output[self.last_idx], self.dtype)

    # pooler
    batch_size = P.Shape()(input_ids)[0]
    sequence_slice = self.slice(sequence_output,
                                (0, 0, 0),
                                (batch_size, 1, self.hidden_size),
                                (1, 1, 1))
    first_token = self.squeeze_1(sequence_slice)
    pooled_output = self.dense(first_token)
    pooled_output = self.cast(pooled_output, self.dtype)

    return sequence_output, pooled_output, embedding_tables
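# A minimal NumPy sketch (illustrative, not part of the model) of the
# [batch_size, seq_length, seq_length] mask used above, assuming
# _create_attention_mask_from_input_mask broadcasts the padding mask over
# query positions as in the reference BERT implementation.
import numpy as np

def make_attention_mask(input_mask):
    mask = input_mask.astype(np.float32)
    ones = np.ones_like(mask)[:, :, None]   # [batch_size, seq_length, 1]
    return ones * mask[:, None, :]          # [batch_size, seq_length, seq_length]

# one sequence of length 5 with two padding positions
print(make_attention_mask(np.array([[1, 1, 1, 0, 0]])))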
def construct(self, input_ids, input_mask, token_type_id):
    """Compute per-token log-probabilities over the label set."""
    sequence_output, _, _ = self.bert(input_ids, token_type_id, input_mask)
    batch_size, seq_length, hidden_size = P.Shape()(sequence_output)
    # flatten tokens so the dense classifier runs over every position at once
    sequence = P.Reshape()(sequence_output, (-1, hidden_size))
    logits = self.dense1(sequence)
    logits = P.Cast()(logits, self.dtype)
    # restore the [batch_size, seq_length, num_labels] layout before log-softmax
    logits = P.Reshape()(logits, (batch_size, seq_length, self.num_labels))
    logits = self.log_softmax(logits)
    return logits
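# A follow-up sketch (illustrative values, not part of the head above): the
# log-probabilities returned by construct() are usually decoded into per-token
# label ids with an argmax over the last axis.
import numpy as np

logits = np.random.randn(2, 8, 5)                                        # [batch, seq_len, num_labels]
log_probs = logits - np.log(np.exp(logits).sum(axis=-1, keepdims=True))  # log-softmax, as in the head
predicted_ids = log_probs.argmax(axis=-1)                                # [batch, seq_len] label ids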
def __init__(self, params, learning_rate=1e-3, beta1=0.9, beta2=0.999, eps=1e-6, weight_decay=0.0):
    super(AdamWeightDecayForBert, self).__init__(learning_rate, params, weight_decay)
    _check_param_value(beta1, beta2, eps, self.cls_name)
    self.beta1 = ts.array([beta1], dtype=ts.float32)
    self.beta2 = ts.array([beta2], dtype=ts.float32)
    self.eps = ts.array([eps], dtype=ts.float32)
    # first- and second-moment accumulators, one per trainable parameter
    self.moments1 = self.parameters.clone(prefix="adam_m", init='zeros')
    self.moments2 = self.parameters.clone(prefix="adam_v", init='zeros')
    self.hyper_map = P.HyperMap()
    self.op_select = P.Select()
    self.op_cast = P.Cast()
    self.op_reshape = P.Reshape()
    self.op_shape = P.Shape()
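# A minimal usage sketch (hypothetical variable names): `net` is assumed to be
# a MindSpore Cell whose trainable_params() supplies the parameter list.
optimizer = AdamWeightDecayForBert(net.trainable_params(),
                                   learning_rate=2e-5,
                                   weight_decay=0.01)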
def __init__(self,
             length,
             depth,
             max_relative_position,
             initializer_range,
             use_one_hot_embeddings=False):
    super(RelaPosEmbeddingsGenerator, self).__init__()
    self.depth = depth
    # one bucket per clipped offset in [-max_relative_position, max_relative_position]
    self.vocab_size = max_relative_position * 2 + 1
    self.use_one_hot_embeddings = use_one_hot_embeddings
    self.embeddings_table = Parameter(
        initializer(TruncatedNormal(initializer_range),
                    [self.vocab_size, self.depth]))
    self.relative_positions_matrix = RelaPosMatrixGenerator(length=length,
                                                            max_relative_position=max_relative_position)
    self.reshape = P.Reshape()
    self.one_hot = layers.OneHot(depth=self.vocab_size)
    self.shape = P.Shape()
    self.gather = P.Gather()  # index_select
    self.matmul = P.BatchMatMul()
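# A NumPy sketch (illustrative helper, not this repository's code) of the
# clipped, shifted relative-position index matrix that RelaPosMatrixGenerator
# feeds into this embedding table; it uses exactly
# vocab_size = 2 * max_relative_position + 1 distinct buckets.
import numpy as np

def relative_position_buckets(length, max_relative_position):
    rel = np.arange(length)[None, :] - np.arange(length)[:, None]      # pairwise offsets j - i
    rel = np.clip(rel, -max_relative_position, max_relative_position)  # clip long-range offsets
    return rel + max_relative_position                                 # shift into [0, 2 * max_relative_position]

print(relative_position_buckets(length=4, max_relative_position=2))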
def construct(self, *args):
    weights = self.weights
    loss = self.network(*args)
    # sensitivity tensor: the initial gradient fed into backpropagation
    sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
    grads = self.grad(self.network, weights)(*args, sens)
    # apply the optimizer update, keeping the data dependency so loss is returned
    return P.depend(loss, self.optimizer(grads))
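# A wiring sketch (class and variable names are illustrative): the train-step
# cell above wraps a network-with-loss and an optimizer; each call runs one
# forward pass, one backward pass, and one parameter update.
train_cell = BertTrainOneStepCell(net_with_loss, optimizer)  # hypothetical cell using the construct above
train_cell.set_train()
loss = train_cell(input_ids, input_mask, token_type_ids, label_ids)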