def __call__(self):
    with distribute.data_placement_scope():
        x = flow.data.megatron_gpt_mmap_data_loader(
            data_file_prefix=self.dataset,
            seq_length=self.seq_length,
            num_samples=self.num_samples,
            batch_size=self.batch_size,
            dtype=flow.int64,
            shuffle=True,
            random_seed=self.seed,
            split_sizes=self.split,
            split_index=0,
            nd_sbp=distribute.get_data_parallel_dist(),
            name=self.name,
        )

    # embedding is on pipeline first stage
    with distribute.layer_placement_scope(0):
        data = flow.slice(x, begin=(None, 0), size=(None, self.seq_length))
    # loss is on pipeline last stage
    with distribute.layer_placement_scope(-1):
        labels = flow.slice(x, begin=(None, 1), size=(None, self.seq_length))

    return data, labels

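# Hedged, standalone sketch (NumPy only, not part of the loader above): the two
# slices split a (batch, seq_length + 1) token batch into inputs and left-shifted
# labels for language-model training. The token values below are made up.
import numpy as np

seq_length = 4
x = np.array([[11, 12, 13, 14, 15]])       # (batch, seq_length + 1)
data = x[:, 0:seq_length]                  # tokens [0, seq_length): model input
labels = x[:, 1:seq_length + 1]            # tokens [1, seq_length]: next-token targets
assert (data == [[11, 12, 13, 14]]).all()
assert (labels == [[12, 13, 14, 15]]).all()
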
def SQuAD(
    input_ids_blob,
    input_mask_blob,
    token_type_ids_blob,
    vocab_size,
    seq_length=512,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=16,
    initializer_range=0.02,
):
    backbone = bert_util.BertBackbone(
        input_ids_blob=input_ids_blob,
        input_mask_blob=input_mask_blob,
        token_type_ids_blob=token_type_ids_blob,
        vocab_size=vocab_size,
        seq_length=seq_length,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        num_attention_heads=num_attention_heads,
        intermediate_size=intermediate_size,
        hidden_act=hidden_act,
        hidden_dropout_prob=hidden_dropout_prob,
        attention_probs_dropout_prob=attention_probs_dropout_prob,
        max_position_embeddings=max_position_embeddings,
        type_vocab_size=type_vocab_size,
        initializer_range=initializer_range,
    )

    with flow.scope.namespace("cls-squad"):
        final_hidden = backbone.sequence_output()
        final_hidden_matrix = flow.reshape(final_hidden, [-1, hidden_size])
        logits = bert_util._FullyConnected(
            final_hidden_matrix,
            hidden_size,
            units=2,
            weight_initializer=bert_util.CreateInitializer(initializer_range),
            name="output",
        )
        logits = flow.reshape(logits, [-1, seq_length, 2])

        start_logits = flow.slice(logits, [None, None, 0], [None, None, 1])
        end_logits = flow.slice(logits, [None, None, 1], [None, None, 1])

    return start_logits, end_logits

def query_key_value(self, h):
    """
    Split the input into q, k, v and split the hidden states into heads, shape:
        (batch_size, seq_length, hidden_size)
            -> (batch_size, seq_length, num_attn_heads, head_size)
            -> (batch_size, num_attn_heads, seq_length, head_size)
    """
    assert len(h.shape) == 3

    # Note: the factor of 3 sits between num_heads and head_size so that the
    # q, k, v features of each head are arranged contiguously.
    new_shape = (
        h.shape[0],
        h.shape[1],
        self.num_heads,
        3 * self.head_size,
    )
    if h.shape[0] == self.seq_length and h.shape[1] == self.batch_size:
        perm = [1, 2, 0, 3]
    elif h.shape[0] == self.batch_size and h.shape[1] == self.seq_length:
        perm = [0, 2, 1, 3]
    else:
        raise ValueError(
            f"unexpected input shape {h.shape}: expected the leading dimensions "
            "to be (seq_length, batch_size) or (batch_size, seq_length)"
        )

    h = flow.reshape(h, new_shape)
    q, k, v = (
        flow.transpose(
            flow.slice(
                h,
                begin=[None, None, None, i * self.head_size],
                size=[None, None, None, self.head_size],
            ),
            perm=perm,
        )
        for i in range(3)
    )
    return q, k, v

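# Hedged sketch (NumPy, illustrative sizes only): a standalone check of the
# reshape -> slice -> transpose head split used in query_key_value above, for
# the (batch_size, seq_length, ...) layout. All sizes here are assumptions.
import numpy as np

batch_size, seq_length, num_heads, head_size = 2, 8, 4, 16
h = np.random.rand(batch_size, seq_length, num_heads * 3 * head_size)

# (batch, seq, hidden) -> (batch, seq, num_heads, 3 * head_size)
h = h.reshape(batch_size, seq_length, num_heads, 3 * head_size)
q, k, v = (
    # take one head_size-wide chunk per head, then move heads in front of seq
    np.transpose(h[:, :, :, i * head_size:(i + 1) * head_size], (0, 2, 1, 3))
    for i in range(3)
)
for t in (q, k, v):
    assert t.shape == (batch_size, num_heads, seq_length, head_size)
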
def train(
    x: flow.typing.Numpy.Placeholder(
        (args.global_batch_size, args.seq_length + 1), dtype=flow.int64
    )
):
    x = distribute.input_data_parallel_cast(x)

    # input tokens [0, seq_length) go to the embedding on the first pipeline stage
    with distribute.layer_placement_scope(0):
        data = flow.slice(x, begin=(None, 0), size=(None, args.seq_length))
    # shifted tokens [1, seq_length] are the labels on the last pipeline stage
    with distribute.layer_placement_scope(-1):
        labels = flow.slice(x, begin=(None, 1), size=(None, args.seq_length))

    logits = model(data)
    losses = loss(logits, labels)
    optimizer.minimize(losses)

    losses = distribute.output_parallel_cast(losses)
    return {"loss": losses}

def PooledOutput(sequence_output, hidden_size, initializer_range):
    with flow.scope.namespace("bert-pooler"):
        # take the hidden state of the first ([CLS]) token
        first_token_tensor = flow.slice(
            sequence_output, [None, 0, 0], [None, 1, -1]
        )
        first_token_tensor = flow.reshape(first_token_tensor, [-1, hidden_size])
        pooled_output = bert_util._FullyConnected(
            first_token_tensor,
            input_size=hidden_size,
            units=hidden_size,
            weight_initializer=bert_util.CreateInitializer(initializer_range),
            name="dense",
        )
        pooled_output = flow.math.tanh(pooled_output)
    return pooled_output

def _EmbeddingPostprocessor(
    input_blob,
    seq_length,
    embedding_size,
    use_token_type=False,
    token_type_ids_blob=None,
    token_type_vocab_size=16,
    token_type_embedding_name="token_type_embeddings",
    use_position_embeddings=True,
    position_embedding_name="position_embeddings",
    initializer_range=0.02,
    max_position_embeddings=512,
    dropout_prob=0.1,
):
    output = input_blob

    if use_token_type:
        assert token_type_ids_blob is not None
        token_type_table = flow.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, embedding_size],
            dtype=input_blob.dtype,
            initializer=CreateInitializer(initializer_range),
        )
        token_type_embeddings = flow.gather(
            params=token_type_table, indices=token_type_ids_blob, axis=0
        )
        output = output + token_type_embeddings

    if use_position_embeddings:
        position_table = flow.get_variable(
            name=position_embedding_name,
            shape=[1, max_position_embeddings, embedding_size],
            dtype=input_blob.dtype,
            initializer=CreateInitializer(initializer_range),
        )
        assert seq_length <= max_position_embeddings
        if seq_length != max_position_embeddings:
            position_table = flow.slice(
                position_table, begin=[None, 0, 0], size=[None, seq_length, -1]
            )
        output = output + position_table

    output = _LayerNorm(output, embedding_size)
    output = _Dropout(output, dropout_prob)
    return output

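# Hedged sketch (NumPy, illustrative sizes only): slicing a
# (1, max_position_embeddings, embedding_size) position table down to the
# actual sequence length and adding it by broadcasting, as in
# _EmbeddingPostprocessor above.
import numpy as np

batch_size, seq_length, max_position_embeddings, embedding_size = 2, 6, 512, 8
output = np.zeros((batch_size, seq_length, embedding_size))
position_table = np.random.rand(1, max_position_embeddings, embedding_size)

position_embeddings = position_table[:, :seq_length, :]  # (1, seq_length, dim)
output = output + position_embeddings                    # broadcast over batch
assert output.shape == (batch_size, seq_length, embedding_size)
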
def self_attn_qk_v_fw_bw(
    h: flow.typing.Numpy.Placeholder(
        shape=(seq_len, batch_size, hidden_size), dtype=flow.float32
    )
) -> typing.Tuple[flow.typing.Numpy, flow.typing.Numpy]:
    var = flow.get_variable(
        "var",
        shape=(1,),
        dtype=flow.float32,
        initializer=flow.constant_initializer(1.0, dtype=flow.float32),
        trainable=True,
    )
    h = h * var
    if fused:
        flow.watch_diff(h, test_global_storage.Setter("h_grad_fused"))
    else:
        flow.watch_diff(h, test_global_storage.Setter("h_grad"))
    if fp16:
        h = flow.amp_white_identity(h)

    alpha = get_alpha(head_size)
    if fused:
        (qmk, v) = flow.nn.fused_self_attention_query_mul_key_and_value(
            h, head_size=head_size, alpha=alpha
        )
    else:
        h = flow.reshape(h, (seq_len, batch_size, -1, 3 * head_size))
        (q, k, v) = (
            flow.transpose(
                flow.slice(
                    h,
                    begin=[None, None, None, head_size * i],
                    size=[None, None, None, head_size],
                ),
                perm=[1, 2, 0, 3],
            )
            for i in range(3)
        )
        qmk = flow.matmul(q, k, transpose_b=True, alpha=alpha)

    h = flow.matmul(qmk, v)
    loss = flow.math.reduce_sum(h)
    flow.optimizer.SGD(get_lr_scheduler(), momentum=0).minimize(loss)
    return (qmk, v)